def test_per(capacity): # test implementation of proritized replay buffer p_buffer = PrioritizedReplayBuffer(capacity) # populate the buffer for _ in range(capacity // 2): p_buffer.add(Experience()) # update batches of experience n_batches = 10 batch_size = 100 for _ in range(10): # randomly sample $batch_size of tree indices idx = random.sample([x for x in range(capacity - 1, 2 * capacity - 1)], batch_size) td_errors = np.random.uniform(0, 10, batch_size) p_buffer.batch_update(idx, td_errors) assert p_buffer.tree.max_priority == np.max( p_buffer.tree.tree[-capacity:]) # test sample for _ in range(10): p_buffer.sample(batch_size) return
def td_learning(args): agent = DQNAgent(args) replay_memory = PrioritizedReplayBuffer(1000000, args.alpha) #eval_game(agent, 500) outer = tqdm(range(args.total_steps), desc='Total steps', position=0) game = init_game() ave_score = 0 count = 0 for step in outer: board = copy.deepcopy(game.gameboard.board) if step < args.start_learn: avail_choices = game.gameboard.get_available_choices() index = np.random.randint(len(avail_choices)) choice = avail_choices[index] else: choice = agent.greedy_policy( board, game.gameboard.get_available_choices()) next_board, reward = game.input_pos(choice[0], choice[1]) next_board = copy.deepcopy(next_board) ##### replay_memory.add(board, choice, reward, next_board) ##### if game.termination(): ave_score += game.gameboard.score count += 1 game = init_game() if step >= args.start_learn and step % args.train_freq == 0: if count > 0: message = "ave score of " + str(count) + " game: " + str( ave_score / count) out_fd.write("{} {}\n".format(step, ave_score / count)) outer.write(message) ave_score = 0 count = 0 if step == args.start_learn: experience = replay_memory.sample(args.start_learn, beta=agent.beta) else: experience = replay_memory.sample(args.train_data_size, beta=agent.beta) boards, choices, rewards, next_boards, weights, batch_idxes = experience td_errors = agent.train( (boards, choices, rewards, next_boards, weights)) new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_memory.update_priorities(batch_idxes, new_priorities) agent.update_target(args.soft_tau) agent.update_epsilon() agent.update_beta() eval_game(agent, 500) out_fd.close()
def learn(env, args): ob = env.reset() ob_shape = ob.shape num_action = int(env.action_space.n) agent = TestAgent(ob_shape, num_action, args) replay_buffer = PrioritizedReplayBuffer(args.buffer_size, alpha=args.prioritized_replay_alpha) args.prioritized_replay_beta_iters = args.max_timesteps beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters, initial_p=args.prioritized_replay_beta0, final_p=1.0) episode_rewards = [0.0] saved_mean_reward = None n_step_seq = [] agent.sample_noise() agent.update_target() for t in range(args.max_timesteps): action = agent.act(ob) new_ob, rew, done, _ = env.step(action) replay_buffer.add(ob, action, rew, new_ob, float(done)) ob = new_ob episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > args.learning_starts and t % args.replay_period == 0: experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(t)) (obs, actions, rewards, obs_next, dones, weights, batch_idxes) = experience agent.sample_noise() kl_errors = agent.update(obs, actions, rewards, obs_next, dones, weights) replay_buffer.update_priorities(batch_idxes, np.abs(kl_errors) + 1e-6) if t > args.learning_starts and t % args.target_network_update_freq == 0: agent.update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and args.print_freq is not None and len(episode_rewards) % args.print_freq == 0: print('steps {} episodes {} mean reward {}'.format(t, num_episodes, mean_100ep_reward))
class RL_AGENT_ONE(): """ RL agent class """ def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr, replay_iters, eps_T, eps_t_init, gamma, update_period, board, device, model_path, r_memory_Fname, o_model_name, model_load=False ): self.step_now = 0 # record the step self.reward_num = 0 self.reward_accumulated = 0 # delay reward self.final_tem = 10 # just for now self.step_last_update = 0 # record the last update time self.update_period = update_period # for the off policy self.learn_start_time = learn_start_time self.gamma = gamma self.batch_size = batch_size self.memory_size = memory_size self.alpha = 0.6 self.beta = 0.4 self.replay_bata_iters = replay_iters self.replay_eps = 1e-6 self.memory_min_num = 1000 #she min num to learn self.step_last_learn = 0 # record the last learn step self.learn_fre = learn_fre # step frequency to learn self.e_greedy = 1 # record the e_greedy self.eps_T = eps_T # par for updating the maybe step 80,0000 self.eps_t_init = eps_t_init # par for updating the eps self.device = device self.model_path = model_path self.mode_enjoy = model_load if model_load == False: self.policy_net = DQN(board[0], board[1], action_num).to(device) self.target_net = DQN(board[0], board[1], action_num).to(device) self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr) self.loss_fn = nn.functional.mse_loss # use the l1 loss self.memory = PrioritizedReplayBuffer(memory_size, self.alpha) self.beta_schedule = LinearSchedule(self.replay_bata_iters, self.beta, 1.0) else: self.load(o_model_name) #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr) self.obs_new = None self.obs_old = None self.action = None self.action_old = None self.dqn_direct_flag = False # show if the dqn action is done self.model_save_flag = False def reset(self): """ reset the flag, state, reward for a new half or game """ self.obs_new = None self.obs_old = None self.action = None self.dqn_direct_flag = False def load(self, old_model): """ load the trained model par: |old_model:str, the name of the old model """ model_path_t = self.model_path + 't' + old_model self.target_net = torch.load(model_path_t, map_location=self.device) self.target_net.eval() print('target net par', self.target_net.state_dict()) def save(self): """ save the trained model """ t = time.strftime('%m%d%H%M%S') self.model_path_p = self.model_path + 'p' + t + '.pt' self.model_path_t = self.model_path + 't' + t + '.pt' print('target net par is', self.policy_net.state_dict()) torch.save(self.policy_net, self.model_path_p) torch.save(self.target_net, self.model_path_t) def learn(self, env, step_now, obs_old, action, obs_new, reward, done): """ This func is used to learn the agent par: |step_now: int, the global time of training |env: class-Environment, use it for nothing |transition: action, obs_new, reward |obs_old/new: instance obs |done: bool, if the game is over """ """ check if we should update the policy net """ if step_now - self.step_last_update == self.update_period: self.step_last_update = step_now self.target_net.load_state_dict(self.policy_net.state_dict()) """ init the obs_new for init learn """ state_new = self.feature_combine(obs_new) # get the feature state state_old = self.feature_combine(obs_old) # get the feature state transition_now = (state_old, action, \ reward, state_new) """ augument reward data to the memory """ if reward > 0: self.memory.add(*self.data_augment(transition_now), done) self.memory.add(state_old, action, \ reward, state_new, done) """ select the batch memory to update 
the network """ step_diff = step_now - self.step_last_learn if step_now > self.learn_start_time and \ step_diff >= self.learn_fre and \ self.memory.__len__() > self.memory_min_num: self.step_last_learn = step_now # update the self.last learn batch_data = self.memory.sample(self.batch_size, \ beta=self.beta_schedule.value(step_now)) s_o_set, actions, rewards, s_n_set, dones, weights, idx_set = batch_data loss_list = [] batch_idx_list = [] reward_not_zero_cnt = 0 actions = [torch.tensor(a, device=self.device) \ for a in actions] """ cnt how many times learn for non reward """ actions_new = [self.policy_net(s_n).detach().max(0)[1] \ for s_n in s_n_set] target_values = [self.gamma*self.target_net(s_n).gather(0, actions_new[idx]) \ for idx, s_n in enumerate(s_n_set)] target_values = [t_*(1 - d_) + r_ \ for t_, d_, r_ in zip(target_values, dones, rewards)] policy_values = [self.policy_net(s).gather(0, a) \ for s, a in zip(s_o_set, actions)] loss = [self.loss_fn(p_v, t_v)+ self.replay_eps \ for p_v, t_v in zip(policy_values, target_values)] loss_back = sum(loss) / self.batch_size """ update the par """ self.optimizer.zero_grad() loss_back.backward() self.optimizer.step() self.memory.update_priorities(idx_set, torch.tensor(loss).detach().numpy()) """ check if we should save the model """ if self.model_save_flag == True: self.save() def select_egreedy(self, q_value, step_now): """ select the action by e-greedy policy arg: |q_value: the greedy standard """ self.e_greedy = np.exp((self.eps_t_init - step_now) / self.eps_T) if self.e_greedy < 0.3: self.e_greedy = 0.3 """ if we are in enjoying mode """ if self.mode_enjoy == True: print('q_value is', q_value) self.e_greedy = 0.3 """ select the action by e-greedy """ if np.random.random() > self.e_greedy: action = action_list[ \ np.where(q_value==np.max(q_value))[0][0] ] else: action = action_list[np.random.randint(action_num)] return action def feature_combine(self, obs): """ This file extract features from the obs.layers and combine them into a new feature layer Used feature layers: """ """ combine all the layers """ feature_c = obs.copy() feature_c = feature_c.astype(np.float32) feature_c = torch.tensor(feature_c, dtype=torch.float32, device=self.device) size = feature_c.shape feature_c = feature_c.resize_(1, 1, size[0], size[1]) return feature_c def data_augment(self, transition): """ use this func to flip the feature, to boost the experience, deal the problem of sparse reward par: |transition: tuple, with (feature_o, action, feature_n, reward) """ flip_ver_dim = 2 feature_old = transition[0] action = transition[1] feature_new = transition[3] reward = transition[2] """ vertical flip """ feature_o_aug = feature_old.flip([flip_ver_dim]) feature_n_aug = feature_new.flip([flip_ver_dim]) """ vertical :action flip """ if action == 0: action = 1 elif action == 1: action = 0 return feature_o_aug, action, reward, feature_n_aug def act(self, map, step_now): """ this func is interact with the competition func """ dqn_action = -1 # reset state_old = self.feature_combine(map) # get the feature q_values = self.policy_net(state_old) action = self.select_egreedy( \ q_values.cpu().detach().numpy(), step_now)# features to model return action def act_enjoy(self, map): """ this func is interact with the competition func """ dqn_action = -1 # reset step_now = self.eps_T state_old = self.feature_combine(map) # get the feature q_values = self.target_net(state_old) action = self.select_egreedy( \ q_values.cpu().detach().numpy(), step_now)# features to model return action
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, train_mode = True, **network_kwargs ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Examine environment parameters print(str(env)) # Set the default brain to work with default_brain = env.brain_names[0] brain = env.brains[default_brain] num_actions=brain.vector_action_space_size[0] # Create all the functions necessary to train the model sess = get_session() #set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph #observation_space = env.observation_space env_info = env.reset(train_mode=train_mode)[default_brain] state = get_obs_state_lidar(env_info) observation_space=state.copy() #def make_obs_ph(name,Num_action): # tf.placeholder(shape=(None,) + state.shape, dtype=state.dtype, name='st') # return tf.placeholder(tf.float32, shape = [None, Num_action],name=name) def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug =build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_variables(model_file) return act
class DuelingDoubleDQNagent(): def __init__(self): #self.action_space = [0, 1, 2, 3, 4, 5, 6] self.action_space = [i for i in range(4 * 7) ] # 28 grouped action : board 7x14 self.action_size = len(self.action_space) self.next_stone_size = 6 self.state_size = (rows + 1, cols, 1) self.discount_factor = 0.99 # 딥마인드의 논문에서는 PER을 사용하여 샘플링한 데이터는 학습되는 양이 크기 때문에 # 학습의 안정성을 위해 Learning rate를 기존 random uniform sample을 사용했을 때의 1/4 수준으로 줄였기에 이를 반영했습니다. #self.learning_rate = 0.00025 self.learning_rate = 0.0000625 self.epsilon = 0. #1. self.epsilon_min = 0.0 self.epsilon_decay = 1000000 #1000000 self.model = self.build_model() self.target_model = self.build_model() # custom loss function을 따로 정의하여 학습에 사용합니다. self.model_updater = self.model_optimizer() self.batch_size = 64 self.train_start = 50000 #50000 # PER 선언 및 관련 hyper parameter입니다. # beta는 importance sampling ratio를 얼마나 반영할지에 대한 수치입니다. # 정확한 의미는 아니지만 정말 추상적으로 설명드리면 # beta가 크다 -> PER을 사용함으로써 생기는 데이터 편향을 크게 보정하겠다 -> TD-error가 큰 데이터에 대한 학습량 감소, 전체적인 학습은 조금더 안정적 # beta가 작다 -> PER을 사용함으로써 생기는 데이터 편향을 작게 보정하겠다 -> TD-error가 큰 데이터에 대한 학습량 증가, 전체적인 학습은 조금더 불안정 # 논문에서는 초기 beta를 0.4로 두고 학습이 끝날때까지 선형적으로 1까지 증가시킴. # alpha는 TD-error의 크기를 어느정도로 반영할지에 대한 파라미터입니다. 수식으로는 (TD-error)^alpha 로 표현됩니다. # alpha가 0에 가까울수록 TD-error의 크기를 반영하지 않는 것이고 기존의 uniform sampling에 가까워집니다. # alpha가 1에 가까울수록 TD-error의 크기를 반영하는 것이고 PER에 가까워집니다. # 논문에서는 alpha를 0.6으로 사용했습니다. # prioritized_replay_eps는 (TD-error)^alpha를 계산할때 TD-error가 0인 상황을 방지하기위해 TD-error에 더 해주는 아주작은 상수값 입니다. self.memory = PrioritizedReplayBuffer(1000000, alpha=0.6) #1000000 self.beta = 0.4 # 0.4 self.beta_max = 1.0 self.beta_decay = 2000000 #5000000 self.prioritized_replay_eps = 0.000001 # 텐서보드 설정 self.sess = tf.InteractiveSession() K.set_session(self.sess) self.summary_placeholders, self.update_ops, self.summary_op = \ self.setup_summary() self.summary_writer = tf.summary.FileWriter('summary/tetris_dqn', self.sess.graph) self.sess.run(tf.global_variables_initializer()) self.load_model = True if self.load_model: self.model.load_weights("./DQN_tetris_model_0311.h5") self.imitation_mode = False # 각 에피소드 당 학습 정보를 기록 def setup_summary(self): episode_total_reward = tf.Variable(0.) episode_avg_max_q = tf.Variable(0.) episode_duration = tf.Variable(0.) episode_avg_loss = tf.Variable(0.) 
tf.summary.scalar('Total Reward/Episode', episode_total_reward) tf.summary.scalar('Total Clear Line/Episode', episode_avg_max_q) #tf.summary.scalar('Duration/Episode', episode_duration) #tf.summary.scalar('Average Loss/Episode', episode_avg_loss) #tf.train.AdamOptimizer summary_vars = [ episode_total_reward, episode_avg_max_q, episode_duration, episode_avg_loss ] summary_placeholders = [ tf.placeholder(tf.float32) for _ in range(len(summary_vars)) ] update_ops = [ summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars)) ] summary_op = tf.summary.merge_all() return summary_placeholders, update_ops, summary_op def build_model(self): # Dueling DQN state = Input(shape=( self.state_size[0], self.state_size[1], self.state_size[2], )) layer = Conv2D(32, (5, 5), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(state) # 64, (4, 4) layer = Conv2D(32, (3, 3), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer) ## layer = Conv2D(32, (1, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer) ## layer = Conv2D(32, (3, 3), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer) ## layer = Conv2D(32, (1, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer) ## pool_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='valid', data_format=None)(layer) layer_2 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(pool_1) ## layer_2 = Conv2D(32, (1, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer_2) ## layer_2 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(layer_2) pool_2 = MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='valid', data_format=None)(layer_2) layer_r = Conv2D(32, (rows + 1, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(state) layer_c = Conv2D(32, (1, cols), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(state) pool_1_r = Conv2D(32, (13, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(pool_1) pool_1_c = Conv2D(32, (1, 5), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(pool_1) pool_2_r = Conv2D(32, (12, 1), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(pool_2) pool_2_c = Conv2D(32, (1, 4), strides=(1, 1), activation='relu', padding='same', kernel_initializer='he_uniform')(pool_2) layer = Flatten()(layer) layer_2 = Flatten()(layer_2) pool_1 = Flatten()(pool_1) pool_2 = Flatten()(pool_2) layer_r = Flatten()(layer_r) layer_c = Flatten()(layer_c) pool_1_r = Flatten()(pool_1_r) pool_1_c = Flatten()(pool_1_c) pool_2_r = Flatten()(pool_2_r) pool_2_c = Flatten()(pool_2_c) merge_layer = concatenate([ layer, layer_2, pool_1, pool_2, pool_1_c, pool_1_r, pool_2_c, pool_2_r, layer_c, layer_r ], axis=1) merge_layer = Dense(128, activation='relu', kernel_initializer='he_uniform')(merge_layer) vlayer = Dense(64, activation='relu', kernel_initializer='he_uniform')(merge_layer) alayer = Dense(64, activation='relu', kernel_initializer='he_uniform')(merge_layer) v = Dense(1, activation='linear', kernel_initializer='he_uniform')(vlayer) v = Lambda(lambda v: tf.tile(v, [1, self.action_size]))(v) a = Dense(self.action_size, activation='linear', 
kernel_initializer='he_uniform')(alayer) a = Lambda(lambda a: a - tf.reduce_mean(a, axis=-1, keep_dims=True))(a) q = Add()([v, a]) model = Model(inputs=state, outputs=q) # custom loss 및 optimizer를 사용할 것이기에 complie 부분은 주석처리 합니다. # model.compile(loss='logcosh', optimizer=Adam(lr=self.learning_rate)) model.summary() return model def update_target_model(self): self.target_model.set_weights(self.model.get_weights()) ''' def get_action(self, state): if np.random.rand() <= self.epsilon: return random.randrange(self.action_size) else: state = np.float32(state) q_values = self.model.predict(state) return np.argmax(q_values[0]) def get_action(self, env, state): if np.random.rand() <= self.epsilon: if env.new_stone_flag: return random.randrange(4) else: return random.randrange(self.action_size) else: state = np.float32(state) q_values = self.model.predict(state) return np.argmax(q_values[0]) ''' def get_action(self, env, state): if np.random.rand() <= self.epsilon: if env.stone_number(env.stone) == 1: return random.randrange(14) elif env.stone_number(env.stone) == 4 or env.stone_number( env.stone) == 6: return random.randrange(2) * 7 + random.randrange(6) elif env.stone_number(env.stone) == 2 or env.stone_number( env.stone) == 5 or env.stone_number(env.stone) == 7: return random.randrange(4) * 7 + random.randrange(6) elif env.stone_number(env.stone) == 3: return random.randrange(6) else: state = np.float32(state) q_values = self.model.predict(state) r_action = np.argmax(q_values[0]) return np.argmax(q_values[0]) def model_optimizer(self): target = K.placeholder(shape=[None, self.action_size]) weight = K.placeholder(shape=[ None, ]) # hubber loss에 대한 코드입니다. clip_delta = 1.0 pred = self.model.output err = target - pred cond = K.abs(err) < clip_delta squared_loss = 0.5 * K.square(err) linear_loss = clip_delta * (K.abs(err) - 0.5 * clip_delta) loss1 = tf.where(cond, squared_loss, linear_loss) # 기존 hubber loss에 importance sampling ratio를 곱하는 형태의 PER loss를 정의합니다. weighted_loss = tf.multiply(tf.expand_dims(weight, -1), loss1) loss = K.mean(weighted_loss, axis=-1) optimizer = Adam(lr=self.learning_rate) updates = optimizer.get_updates(self.model.trainable_weights, [], loss) train = K.function([self.model.input, target, weight], [err], updates=updates) return train def train_model(self): (update_input, action, reward, update_target, done, weight, batch_idxes) = self.memory.sample(self.batch_size, beta=self.beta) target = self.model.predict(update_input) target_val = self.target_model.predict(update_target) target_val_arg = self.model.predict(update_target) # Double DQN for i in range(self.batch_size): if done[i]: target[i][action[i]] = reward[i] else: a = np.argmax(target_val_arg[i]) target[i][action[ i]] = reward[i] + self.discount_factor * target_val[i][a] # PER에서 mini-batch로 샘플링한 데이터에 대해 학습을 진행합니다. # 학습을 하는 과정에서 새롭게 계산된 TD-error를 다시 반영하기 위해 err는 따로 출력하여 저장합니다. err = self.model_updater([update_input, target, weight]) err = np.reshape(err, [self.batch_size, self.action_size]) # TD-error가 0이 되는것을 방지하기위해 작은 상수를 더해줍니다. new_priorities = np.abs(np.sum(err, axis=1)) + self.prioritized_replay_eps # 샘플링한 데이터에 대해 새롭게 계산된 TD-error를 업데이트 합니다. self.memory.update_priorities(batch_idxes, new_priorities)
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # env = gym.make("FrozenLake8x8rob-v0") # env = gym.make("FrozenLake16x16rob-v0") env = gym.make("TestRob3-v0") # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obses_t, windowLen): deicticObses_t = [] for i in range(np.shape(obses_t)[0] - windowLen): for j in range(np.shape(obses_t)[1] - windowLen): deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:]) return np.array(deicticObses_t) # get set of deictic alternatives # input: batch x n x n x channels # output: (batch x deictic) x dn x dn x channels def getDeictic(obses_t, actions, obses_tp1, weights, windowLen): deicticObses_t = [] deicticActions = [] deicticObses_tp1 = [] deicticWeights = [] for i in range(np.shape(obses_t)[0]): for j in range(np.shape(obses_t)[1] - windowLen): for k in range(np.shape(obses_t)[2] - windowLen): deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:]) deicticActions.append(actions[i]) deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:]) deicticWeights.append(weights[i]) return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong # hiddens=[256], # used in pong # convs=[(8,4,1)], # used for non-deictic TestRob3-v0 convs=[(4,3,1)], # used for deictic TestRob3-v0 hiddens=[16], dueling=True ) # parameters q_func=model lr=1e-3 # max_timesteps=100000 # max_timesteps=50000 max_timesteps=20000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1. 
target_network_update_freq=500 prioritized_replay=False # prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 deicticShape = (3,3,1) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(deicticShape, name=name) matchShape = (batch_size*25,) def make_match_ph(name): return U.BatchInput(matchShape, name=name) sess = U.make_session(num_cpu) sess.__enter__() # act, train, update_target, debug = build_graph.build_train( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min( make_obs_ph=make_obs_ph, make_match_ph=make_match_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # get action to take # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] # qvalues = getq(np.array(obs)[None]) # action = np.argmax(qvalues) # if np.random.rand() < exploration.value(t): # action = np.random.randint(env.action_space.n) deicticObs = getDeicticObs(obs,3) qvalues = getq(np.array(deicticObs)) action = np.argmax(np.max(qvalues,0)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # # temporarily take uniformly random actions all the time # action = np.random.randint(env.action_space.n) new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None # Convert batch to deictic format obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3) obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])] _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True) # matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)] # td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show() sess
class DQNAgent(object): def __init__(self, stateShape, actionSpace, numPicks, memorySize, burnin=1000): self.numPicks = numPicks self.memorySize = memorySize self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6) self.stateShape = stateShape self.actionSpace = actionSpace self.step = 0 self.sync = 200 self.burnin = burnin self.alpha = 0.001 self.epsilon = 1 self.epsilon_decay = 0.5 self.epsilon_min = 0.01 self.eps_threshold = 0 self.gamma = 0.99 self.trainNetwork = self.createNetwork( stateShape, len(actionSpace), self.alpha) self.targetNetwork = self.createNetwork( stateShape, len(actionSpace), self.alpha) self.targetNetwork.set_weights( self.trainNetwork.get_weights()) def createNetwork(self, n_input, n_output, learningRate): model = keras.models.Sequential() model.add(keras.layers.Dense( 24, activation='relu', input_shape=n_input)) model.add(keras.layers.Dense(48, activation='relu')) model.add(keras.layers.Dense(n_output, activation='linear')) model.compile( loss='mse', optimizer=keras.optimizers.Adam(lr=learningRate)) print(model.summary()) return model def trainDQN(self): if len(self.replayMemory) <= self.numPicks or len(self.replayMemory) < self.burnin: return 0 beta = 0.4 + self.step * (1.0 - 0.4) / 300 samples = self.replayMemory.sample(self.numPicks, beta) #batch = Transition(*zip(*samples)) currStates, actions, rewards, nextStates, dones, weights, indices = samples currStates = np.squeeze(np.array(currStates), 1) Q_currents = self.trainNetwork(currStates, training=False).numpy() nextStates = np.squeeze(np.array(nextStates), 1) Q_futures = self.targetNetwork(nextStates, training=False).numpy().max(axis=1) rewards = np.array(rewards).reshape(self.numPicks,).astype(float) actions = np.array(actions).reshape(self.numPicks,).astype(int) dones = np.array(dones).astype(bool) notDones = (~dones).astype(float) dones = dones.astype(float) Q_currents_cp = deepcopy(Q_currents) Q_currents_cp[np.arange(self.numPicks), actions] = rewards * dones + (rewards + Q_futures * self.gamma)*notDones loss = tf.multiply(tf.pow(tf.subtract(Q_currents[np.arange(self.numPicks), actions], Q_currents_cp[np.arange(self.numPicks), actions]), 2), weights).numpy() prios = loss + 1e-5 self.replayMemory.update_priorities(indices, prios) loss = self.trainNetwork.train_on_batch(currStates, Q_currents) return loss def selectAction(self, state): self.step += 1 if self.step % self.sync == 0: self.targetNetwork.set_weights( self.trainNetwork.get_weights()) q = -100000 if np.random.rand(1) < self.epsilon: action = np.random.randint(0, 3) else: preds = np.squeeze(self.trainNetwork( state, training=False).numpy(), axis=0) action = np.argmax(preds) q = preds[action] return action, q def addMemory(self, state, action, reward, nextState, done): self.replayMemory.add(state, action, reward, nextState, done) def save(self): save_path = ( f"./mountain_car_tfngmo_{int(self.step)}.chkpt" ) self.trainNetwork.save( save_path ) print(f"MountainNet saved to {save_path} done!")
def learn(env, network, seed=None, lr=5e-5, total_timesteps=100000, buffer_size=500000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=1, batch_size=32, print_freq=10, checkpoint_freq=100000, checkpoint_path=None, learning_starts=0, gamma=0.99, target_network_update_freq=10000, prioritized_replay=True, prioritized_replay_alpha=0.4, prioritized_replay_beta0=0.6, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-3, param_noise=False, callback=None, load_path=None, load_idx=None, demo_path=None, n_step=10, demo_prioritized_replay_eps=1.0, pre_train_timesteps=750000, epsilon_schedule="constant", **network_kwargs): # Create all the functions necessary to train the model set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) with tf.device('/GPU:0'): model = DQfD(q_func=q_func, observation_shape=env.observation_space.shape, num_actions=env.action_space.n, lr=lr, grad_norm_clipping=10, gamma=gamma, param_noise=param_noise) # Load model from checkpoint if load_path is not None: load_path = osp.expanduser(load_path) ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None) if load_idx is None: ckpt.restore(manager.latest_checkpoint) print("Restoring from {}".format(manager.latest_checkpoint)) else: ckpt.restore(manager.checkpoints[load_idx]) print("Restoring from {}".format(manager.checkpoints[load_idx])) # Setup demo trajectory assert demo_path is not None with open(demo_path, "rb") as f: trajectories = pickle.load(f) # Create the replay buffer replay_buffer = PrioritizedReplayBuffer(buffer_size, prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) temp_buffer = deque(maxlen=n_step) is_demo = True for epi in trajectories: for obs, action, rew, new_obs, done in epi: obs, new_obs = np.expand_dims( np.array(obs), axis=0), np.expand_dims(np.array(new_obs), axis=0) if n_step: temp_buffer.append((obs, action, rew, new_obs, done, is_demo)) if len(temp_buffer) == n_step: n_step_sample = get_n_step_sample(temp_buffer, gamma) replay_buffer.demo_len += 1 replay_buffer.add(*n_step_sample) else: replay_buffer.demo_len += 1 replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), float(is_demo)) logger.log("trajectory length:", replay_buffer.demo_len) # Create the schedule for exploration if epsilon_schedule == "constant": exploration = ConstantSchedule(exploration_final_eps) else: # not used exploration = LinearSchedule(schedule_timesteps=int( exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) model.update_target() # ============================================== pre-training ====================================================== start = time() num_episodes = 0 temp_buffer = deque(maxlen=n_step) for t in tqdm(range(pre_train_timesteps)): # sample and train experience = replay_buffer.sample(batch_size, beta=prioritized_replay_beta0) batch_idxes = experience[-1] if experience[6] is None: # for n_step = 0 obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple( map(tf.constant, experience[:6])) obses_tpn, rewards_n, dones_n = None, None, None weights = tf.constant(experience[-2]) else: obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple( map(tf.constant, experience[:-1])) td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = 
model.train( obses_t, actions, rewards, obses_tp1, dones, is_demos, weights, obses_tpn, rewards_n, dones_n) # Update priorities new_priorities = np.abs(td_errors) + np.abs( n_td_errors) + demo_prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # Update target network periodically if t > 0 and t % target_network_update_freq == 0: model.update_target() # Logging elapsed_time = timedelta(time() - start) if print_freq is not None and t % 10000 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", 0) logger.record_tabular("max 100 episode reward", 0) logger.record_tabular("min 100 episode reward", 0) logger.record_tabular("demo sample rate", 1) logger.record_tabular("epsilon", 0) logger.record_tabular("loss_td", np.mean(loss_dq.numpy())) logger.record_tabular("loss_n_td", np.mean(loss_n.numpy())) logger.record_tabular("loss_margin", np.mean(loss_E.numpy())) logger.record_tabular("loss_l2", np.mean(loss_l2.numpy())) logger.record_tabular("losses_all", weighted_error.numpy()) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("pre_train", True) logger.record_tabular("elapsed time", elapsed_time) logger.dump_tabular() # ============================================== exploring ========================================================= sample_counts = 0 demo_used_counts = 0 episode_rewards = deque(maxlen=100) this_episode_reward = 0. best_score = 0. saved_mean_reward = None is_demo = False obs = env.reset() # Always mimic the vectorized env obs = np.expand_dims(np.array(obs), axis=0) reset = True for t in tqdm(range(total_timesteps)): if callback is not None: if callback(locals(), globals()): break kwargs = {} if not param_noise: update_eps = tf.constant(exploration.value(t)) update_param_noise_threshold = 0. else: # not used update_eps = tf.constant(0.) update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action, epsilon, _, _ = model.step(tf.constant(obs), update_eps=update_eps, **kwargs) action = action[0].numpy() reset = False new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. new_obs = np.expand_dims(np.array(new_obs), axis=0) if n_step: temp_buffer.append((obs, action, rew, new_obs, done, is_demo)) if len(temp_buffer) == n_step: n_step_sample = get_n_step_sample(temp_buffer, gamma) replay_buffer.add(*n_step_sample) else: replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), 0.) obs = new_obs # invert log scaled score for logging this_episode_reward += np.sign(rew) * (np.exp(np.sign(rew) * rew) - 1.) 
if done: num_episodes += 1 obs = env.reset() obs = np.expand_dims(np.array(obs), axis=0) episode_rewards.append(this_episode_reward) reset = True if this_episode_reward > best_score: best_score = this_episode_reward ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, './best_model', max_to_keep=1) manager.save(t) logger.log("saved best model") this_episode_reward = 0.0 if t % train_freq == 0: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) batch_idxes = experience[-1] if experience[6] is None: # for n_step = 0 obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple( map(tf.constant, experience[:6])) obses_tpn, rewards_n, dones_n = None, None, None weights = tf.constant(experience[-2]) else: obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple( map(tf.constant, experience[:-1])) td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train( obses_t, actions, rewards, obses_tp1, dones, is_demos, weights, obses_tpn, rewards_n, dones_n) new_priorities = np.abs(td_errors) + np.abs( n_td_errors ) + demo_prioritized_replay_eps * is_demos + prioritized_replay_eps * ( 1. - is_demos) replay_buffer.update_priorities(batch_idxes, new_priorities) # for logging sample_counts += batch_size demo_used_counts += np.sum(is_demos) if t % target_network_update_freq == 0: # Update target network periodically. model.update_target() if t % checkpoint_freq == 0: save_path = checkpoint_path ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, save_path, max_to_keep=10) manager.save(t) logger.log("saved checkpoint") elapsed_time = timedelta(time() - start) if done and num_episodes > 0 and num_episodes % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", np.mean(episode_rewards)) logger.record_tabular("max 100 episode reward", np.max(episode_rewards)) logger.record_tabular("min 100 episode reward", np.min(episode_rewards)) logger.record_tabular("demo sample rate", demo_used_counts / sample_counts) logger.record_tabular("epsilon", epsilon.numpy()) logger.record_tabular("loss_td", np.mean(loss_dq.numpy())) logger.record_tabular("loss_n_td", np.mean(loss_n.numpy())) logger.record_tabular("loss_margin", np.mean(loss_E.numpy())) logger.record_tabular("loss_l2", np.mean(loss_l2.numpy())) logger.record_tabular("losses_all", weighted_error.numpy()) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("pre_train", False) logger.record_tabular("elapsed time", elapsed_time) logger.dump_tabular() return model
initial_state_v_history = [] print("Starts training on {}".format(next(agent.parameters()).device)) # populate the buffer with 128 samples init_size = 128 play_and_record(state, agent, env, exp_replay, init_size) for step in range(total_steps): agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps) # play for $T time steps and cache the exprs to the buffer _, state = play_and_record(state, agent, env, exp_replay, T) b_idx, obses_t, actions, rewards, obses_tp1, dones, weights = exp_replay.sample( batch_size) # td loss for each sample td_loss = compute_td_loss(states=obses_t, actions=actions, rewards=rewards, next_states=obses_tp1, is_done=dones, agent=agent, target_network=target_network, gamma=gamma, device=device, check_shapes=True) ''' A batch of samples from prioritized replay looks like: (states, actions, rewards, next_states, weights, is_done)
class PrioritizedDQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr, update_every, update_mem_every, update_mem_par_every, experience_per_sampling, seed=25, epsilon=1, epsilon_min=0.01, eps_decay=0.999, compute_weights=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.update_every = update_every self.experience_per_sampling = experience_per_sampling self.update_mem_every = update_mem_every self.update_mem_par_every = update_mem_par_every self.seed = random.seed(seed) self.learn_steps = 0 self.epsilon = epsilon self.epsilon_min = epsilon_min self.eps_decay = eps_decay self.compute_weights = compute_weights # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) self.scheduler = StepLR(self.optimizer, step_size=1, gamma=0.995) # Replay memory self.memory = PrioritizedReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.experience_per_sampling, self.seed, self.compute_weights) # Initialize time step (for updating every UPDATE_NN_EVERY steps) self.t_step_nn = 0 # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps) self.t_step_mem_par = 0 # Initialize time step (for updating every UPDATE_MEM_EVERY steps) self.t_step_mem = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_NN_EVERY time steps. self.t_step_nn = (self.t_step_nn + 1) % self.update_every self.t_step_mem = (self.t_step_mem + 1) % self.update_mem_every self.t_step_mem_par = (self.t_step_mem_par + 1) % self.update_mem_par_every if self.t_step_mem_par == 0: self.memory.update_parameters() if self.t_step_nn == 0: # If enough samples are available in memory, get random subset and learn if self.memory.experience_count > self.experience_per_sampling: sampling = self.memory.sample() self.learn(sampling) if self.t_step_mem == 0: self.memory.update_memory_sampling() def act(self, state): """Returns actions for given state as per current policy. Params ====== state (array_like): current state """ self.epsilon = max(self.epsilon * self.eps_decay, self.epsilon_min) state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) #print(action_values) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > self.epsilon: #print(np.argmax(action_values.cpu().data.numpy())) return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, sampling): """Update value parameters using given batch of experience tuples. 
Params ====== sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, weights, indices = sampling # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) if self.compute_weights: with torch.no_grad(): weight = sum(np.multiply(weights, loss.data.cpu().numpy())) loss *= weight # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.scheduler.step() self.learn_steps += 1 # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) # ------------------- update priorities ------------------- # delta = abs(Q_targets - Q_expected.detach()).numpy() self.memory.update_priorities(delta, indices) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
obs).argmax().item() if use_gpu else behavior_model( torch.as_tensor(obs, device=device)).argmax().item() obs_new, reward, done = skip_action(action, env, skip_steps) tot_reward += reward obs_new = torch.as_tensor(obs_new, device=device) if use_gpu else obs_new memory.add(obs, action, reward, obs_new, done) obs = obs_new if step_counter > warmup: beta_per += beta_per_rate if use_per: states, actions, rewards, states_next, dones, weights, idxes = memory.sample( BS, beta_per) # states, actions, rewards, states_next, dones = data else: states, actions, rewards, states_next, dones = memory.sample( BS) states = torch.stack(states).squeeze( ) if use_gpu else torch.tensor(states, device=device).squeeze() states_next = torch.stack( states_next).squeeze() if use_gpu else torch.tensor( states_next, device=device).squeeze() actions_tensor = torch.tensor(actions, device=device) dones_tensor = torch.tensor(dones, device=device) rewards_tensor = torch.tensor(rewards, device=device) targets = behavior_model(states) with torch.no_grad():
class DQNAgent(object): def __init__( self, stateShape, actionSpace, numPicks, memorySize, numRewards, sync=50, burnin=0, #500, alpha=0.0001, epsilon=1, epsilon_decay=0.9995, epsilon_min=0.01, gamma=0.99, ): self.numPicks = numPicks self.replayMemory = PrioritizedReplayBuffer(memorySize, 0.6) self.stateShape = stateShape self.actionSpace = actionSpace self.step = 0 self.sync = sync self.burnin = burnin self.alpha = alpha self.epsilon = epsilon self.epsilon_decay = epsilon_decay self.epsilon_min = epsilon_min self.gamma = gamma self.walpha = 0.01 self.delay = 1 self.numRewards = numRewards self.trainNetwork = self.createNetwork(stateShape, len(actionSpace), self.alpha) self.targetNetwork = self.createNetwork(stateShape, len(actionSpace), self.alpha) self.targetNetwork.set_weights(self.trainNetwork.get_weights()) def createNetwork(self, n_input, n_output, learningRate): model = keras.models.Sequential() model.add( keras.layers.experimental.preprocessing.Rescaling( 1.0 / 255, input_shape=n_input)) model.add( keras.layers.Conv2D(32, kernel_size=8, strides=4, activation="relu")) model.add( keras.layers.Conv2D(64, kernel_size=4, strides=2, activation="relu")) model.add( keras.layers.Conv2D(64, kernel_size=3, strides=1, activation="relu")) model.add(keras.layers.Flatten()) model.add(keras.layers.Dense(512, activation="linear")) model.add(keras.layers.Dense(n_output, activation="linear")) model.compile(loss=keras.losses.Huber(), optimizer=keras.optimizers.Adam(lr=learningRate)) print(model.summary()) return model def trainDQN(self): if self.step <= self.numPicks or len(self.replayMemory) <= self.burnin: return 0 self.beta = 0.4 + self.step * (1.0 - 0.4) / 30000 samples = self.replayMemory.sample(self.numPicks, self.beta) currStates, actions, rewards, nextStates, dones, weights, indices = samples currStates = np.array(currStates).transpose(0, 2, 3, 1) Q_currents = self.trainNetwork(currStates, training=False).numpy() nextStates = np.array(nextStates).transpose(0, 2, 3, 1) Q_futures = self.targetNetwork(nextStates, training=False).numpy().max(axis=1) rewards = (np.array(rewards).reshape(self.numPicks, ).astype(float)) actions = (np.array(actions).reshape(self.numPicks, ).astype(int)) dones = np.squeeze(np.array(dones)).astype(bool) notDones = (~dones).astype(float) dones = dones.astype(float) Q_currents_cp = deepcopy(Q_currents) Q_currents_cp[np.arange(self.numPicks), actions] = (rewards + Q_futures * self.gamma * notDones) h = tf.keras.losses.Huber() loss = h( Q_currents[np.arange(self.numPicks), actions], Q_currents_cp[np.arange(self.numPicks), actions], ) prios = (np.abs(loss) * weights) + 1e-5 self.replayMemory.update_priorities(indices, prios) loss = self.trainNetwork.train_on_batch(currStates, Q_currents_cp) return loss def selectAction(self, state): self.step += 1 self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min) q = -100000 if np.random.rand(1) < self.epsilon: action = np.random.randint(0, 3) else: preds = np.squeeze( self.trainNetwork( np.expand_dims(np.array(state).transpose(1, 2, 0), 0), training=False, ).numpy(), axis=0, ) action = np.argmax(preds) q = preds[action] return action, q def addMemory(self, state, action, reward, nextState, done): self.replayMemory.add(state, action, reward, nextState, done) def save(self): save_path = f"./dst_net_{int(self.step)}.chkpt" train_w = self.trainNetwork.get_weights() target_w = self.trainNetwork.get_weights() with open(save_path, "wb") as f: pickle.dump([train_w, target_w], f) print(f"DSTNet saved to {save_path} done!") def 
load(self):
        save_path = "./dst_net_mixed.chkpt"
        with open(save_path, "rb") as f:
            weights = pickle.load(f)
        self.trainNetwork.set_weights(weights[0])
        self.targetNetwork.set_weights(weights[1])
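A hedged sketch of how this agent might be driven from an environment loop; the `env` object, its `reset()`/`step()` interface, and all shapes and sizes below are illustrative assumptions, not part of the original code.

# Hypothetical driver loop (env and all sizes are placeholders).
agent = DQNAgent(stateShape=(84, 84, 4), actionSpace=[0, 1, 2],
                 numPicks=32, memorySize=20000, numRewards=1)

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        action, q = agent.selectAction(state)              # epsilon-greedy choice
        next_state, reward, done, info = env.step(action)
        agent.addMemory(state, action, reward, next_state, done)
        loss = agent.trainDQN()                            # returns 0 until enough samples exist
        state = next_state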
def learn(env_id, q_func, lr=5e-4, max_timesteps=10000, buffer_size=5000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, train_steps=10, learning_starts=500, batch_size=32, print_freq=10, checkpoint_freq=100, model_dir=None, gamma=1.0, target_network_update_freq=50, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, player_processes=None, player_connections=None): env, _, _ = create_gvgai_environment(env_id) # Create all the functions necessary to train the model # expert_decision_maker = ExpertDecisionMaker(env=env) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) session = tf.Session() session.__enter__() policy_path = os.path.join(model_dir, "Policy.pkl") model_path = os.path.join(model_dir, "model", "model") if os.path.isdir(os.path.join(model_dir, "model")): load_state(model_path) else: act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Initialize the parameters and copy them to the target network. U.initialize() update_target() act.save(policy_path) save_state(model_path) env.close() # Create the replay buffer if prioritized_replay: replay_buffer_path = os.path.join(model_dir, "Prioritized_replay.pkl") if os.path.isfile(replay_buffer_path): with open(replay_buffer_path, 'rb') as input_file: replay_buffer = pickle.load(input_file) else: replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_path = os.path.join(model_dir, "Normal_replay.pkl") if os.path.isfile(replay_buffer_path): with open(replay_buffer_path, 'rb') as input_file: replay_buffer = pickle.load(input_file) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) episode_rewards = list() saved_mean_reward = -999999999 signal.signal(signal.SIGQUIT, signal_handler) global terminate_learning total_timesteps = 0 for timestep in range(max_timesteps): if terminate_learning: break for connection in player_connections: experiences, reward = connection.recv() episode_rewards.append(reward) for experience in experiences: replay_buffer.add(*experience) total_timesteps += 1 if total_timesteps < learning_starts: if timestep % 10 == 0: print("not strated yet", flush=True) continue if timestep % train_freq == 0: for i in range(train_steps): # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(total_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if timestep % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if print_freq is not None and timestep % print_freq == 0: logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * exploration.value(total_timesteps))) logger.dump_tabular() if timestep % checkpoint_freq == 0 and mean_100ep_reward > saved_mean_reward: act.save(policy_path) save_state(model_path) saved_mean_reward = mean_100ep_reward with open(replay_buffer_path, 'wb') as output_file: pickle.dump(replay_buffer, output_file, pickle.HIGHEST_PROTOCOL) send_message_to_all(player_connections, Message.UPDATE) send_message_to_all(player_connections, Message.TERMINATE) if mean_100ep_reward > saved_mean_reward: act.save(policy_path) with open(replay_buffer_path, 'wb') as output_file: pickle.dump(replay_buffer, output_file, pickle.HIGHEST_PROTOCOL) for player_process in player_processes: player_process.join() # player_process.terminate() return act.load(policy_path)
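Both `beta_schedule` and `exploration` here are `LinearSchedule` objects; in OpenAI baselines this is just a clipped linear interpolation between `initial_p` and `final_p`. A simplified sketch of that behavior:

# Minimal sketch of the linear interpolation both schedules rely on
# (mirrors baselines.common.schedules.LinearSchedule, simplified).
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # fraction of the schedule completed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# beta is annealed from beta0 toward 1.0 over the whole run
beta_schedule = LinearSchedule(10000, initial_p=0.4, final_p=1.0)
assert abs(beta_schedule.value(5000) - 0.7) < 1e-9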
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # env = gym.make("FrozenLake8x8rob-v0") # env = gym.make("FrozenLake16x16rob-v0") env = gym.make("TestRob3-v0") # input: batch x nxnx1 tensor of observations def convertState(observations): shape = np.shape(observations) observations_small = np.squeeze(observations) agent_pos = np.nonzero(observations_small == 10) ghost_pos = np.nonzero(observations_small == 20) state_numeric = 3 * np.ones((4, shape[0])) state_numeric[0, agent_pos[0]] = agent_pos[1] state_numeric[1, agent_pos[0]] = agent_pos[2] state_numeric[2, ghost_pos[0]] = ghost_pos[1] state_numeric[3, ghost_pos[0]] = ghost_pos[2] return np.int32(state_numeric) # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obses_t, windowLen): deicticObses_t = [] for i in range(np.shape(obses_t)[0] - windowLen + 1): for j in range(np.shape(obses_t)[1] - windowLen + 1): deicticObses_t.append(obses_t[i:i + windowLen, j:j + windowLen, :]) return np.array(deicticObses_t) # get set of deictic alternatives # input: batch x n x n x channels # output: (batch x deictic) x dn x dn x channels def getDeictic(obses_t, actions, obses_tp1, weights, windowLen): deicticObses_t = [] deicticActions = [] deicticObses_tp1 = [] deicticWeights = [] for i in range(np.shape(obses_t)[0]): for j in range(np.shape(obses_t)[1] - windowLen + 1): for k in range(np.shape(obses_t)[2] - windowLen + 1): deicticObses_t.append(obses_t[i, j:j + windowLen, k:k + windowLen, :]) deicticActions.append(actions[i]) deicticObses_tp1.append(obses_tp1[i, j:j + windowLen, k:k + windowLen, :]) deicticWeights.append(weights[i]) return np.array(deicticObses_t), np.array(deicticActions), np.array( deicticObses_tp1), np.array(deicticWeights) # Get deictic patch and action groupings # input: obses_deic, actions_deic -> Nx.. a bunch of deictic patches and actions # output: groups -> assignment of each row in obses_deic, actions_deic to a group # def getDeicticGroups(obses_deic, actions_deic, max_num_groups): def getDeicticGroups(obses_deic, max_num_groups): # create groups of equal obs/actions shape = np.shape(obses_deic) obses_deic_flat = np.reshape(obses_deic, [shape[0], shape[1] * shape[2]]) _, group_matching, group_counts = np.unique(obses_deic_flat, axis=0, return_inverse=True, return_counts=True) # obses_actions_deic_flat = np.c_[obses_deic_flat,actions_deic] # _, group_matching, group_counts = np.unique(obses_actions_deic_flat,axis=0,return_inverse=True,return_counts=True) # # take max_num_groups of most frequent groups # group_indices = np.float32(np.r_[np.array([group_counts]),np.array([range(np.shape(group_counts)[0])])]) # group_indices[0] = group_indices[0] + np.random.random(np.shape(group_indices)[1])*0.1 # add small random values to randomize sort order for equal numbers # group_indices_sorted = group_indices[:,group_indices[0,:].argsort()] # group_indices_to_keep = np.int32(group_indices_sorted[1,-max_num_groups:]) # # # Replace group numbers with new numbers in 0:max_num_groups # # All elts with group=max_num_groups have no group. 
# new_group_matching = np.ones(np.shape(group_matching)[0])*max_num_groups # for i in range(np.shape(group_indices_to_keep)[0]): # idx = np.nonzero(group_matching == group_indices_to_keep[i]) # new_group_matching[idx] = i # # # Get final list of groups. Get observations, actions corresponding to each group # groups,idx = np.unique(new_group_matching,return_index=True) # groups_idx = np.r_[np.array([groups]),np.array([idx])] # groups_idx_sorted = groups_idx[:,groups_idx[0].argsort()] # groups = groups_idx_sorted[0] # idx = np.int32(groups_idx_sorted[1,:-1]) # group_obs = obses_deic_flat[idx] # group_actions = actions_deic[idx] # # # reshape output observations # obsshape = np.shape(group_obs) # group_obs = np.reshape(group_obs,(obsshape[0],np.int32(np.sqrt(obsshape[1])),np.int32(np.sqrt(obsshape[1])),1)) # Get final list of groups. Get observations, actions corresponding to each group groups, idx = np.unique(group_matching, return_index=True) group_obs = obses_deic_flat[idx] # reshape output observations obsshape = np.shape(group_obs) group_obs = np.reshape(group_obs, (obsshape[0], shape[1], shape[2], shape[3])) # return new_group_matching, group_obs, group_actions # return group_matching, group_obs return group_matching # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong # hiddens=[256], # used in pong # convs=[(8,4,1)], # used for non-deictic TestRob3-v0 # convs=[(8,3,1)], # used for deictic TestRob3-v0 convs=[(16, 3, 1)], # used for deictic TestRob3-v0 # convs=[(4,3,1)], # used for deictic TestRob3-v0 # convs=[(16,3,1)], # used for deictic TestRob3-v0 # convs=[(8,2,1)], # used for deictic TestRob3-v0 hiddens=[16], dueling=True) # model = models.mlp([6]) # parameters q_func = model lr = 1e-3 # lr=1e-4 # max_timesteps=100000 # max_timesteps=50000 max_timesteps = 20000 buffer_size = 50000 # exploration_fraction=0.1 exploration_fraction = 0.2 exploration_final_eps = 0.02 # exploration_final_eps=0.005 # exploration_final_eps=0.1 print_freq = 10 checkpoint_freq = 10000 learning_starts = 1000 gamma = .98 target_network_update_freq = 500 prioritized_replay = False # prioritized_replay=True prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 num_cpu = 16 # batch_size=32 # train_freq=1 batch_size = 64 train_freq = 2 # batch_size=128 # train_freq=4 # batch_size=256 # train_freq=4 # batch_size=512 # train_freq=8 # batch_size=1024 # train_freq=8 # batch_size=2048 # train_freq=8 # batch_size=4096 # train_freq=8 max_num_groups = 600 # deicticShape must be square. # These two parameters need to be consistent w/ each other. 
# deicticShape = (2,2,1) # num_deictic_patches=36 deicticShape = (3, 3, 1) num_deictic_patches = 36 # deicticShape = (4,4,1) # num_deictic_patches=25 # deicticShape = (5,5,1) # num_deictic_patches=16 # deicticShape = (6,6,1) # num_deictic_patches=9 # deicticShape = (7,7,1) # num_deictic_patches=4 # deicticShape = (8,8,1) # num_deictic_patches=1 num_actions = 4 tabularQ = 100 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) OHEnc = np.identity(max_num_groups) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(deicticShape, name=name) matchShape = (batch_size * 25, ) def make_match_ph(name): return U.BatchInput(matchShape, name=name) def parallelUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, group_matching, dones, q_tp1, batch_size, num_deictic_patches, max_num_groups): q_tp1_target = rewards + gamma * np.max( np.reshape(np.max(q_tp1, 1), [batch_size, num_deictic_patches]), 1) q_tp1_target = (1 - dones) * q_tp1_target group_matching_onehot = OHEnc[group_matching] desc_2_state = np.max( np.reshape(group_matching_onehot, [batch_size, num_deictic_patches, max_num_groups]), 1) max_target = np.max(q_tp1_target) target_min_per_D = np.min( desc_2_state * np.tile(np.reshape(q_tp1_target, [batch_size, 1]), [1, max_num_groups]) + (1 - desc_2_state) * max_target, 0) # I noticed that the line below produces unpredictable behavior. The dotprod does not seem to produce consistent results for some reason. Use the line below that instead. # targets1 = np.dot(group_matching_onehot,target_min_per_D) targets = np.sum( group_matching_onehot * np.tile(np.reshape(target_min_per_D, [1, max_num_groups]), [batch_size * num_deictic_patches, 1]), 1) D_2_DI = group_matching_onehot return q_tp1_target, desc_2_state, target_min_per_D, D_2_DI, targets sess = U.make_session(num_cpu) sess.__enter__() # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min_streamlined( # getq, trainWOUpdate = build_graph.build_train_deictic_min_streamlined( getq, train, trainWOUpdate, update_target = build_graph.build_train_deictic_min_streamlined( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, batch_size=batch_size, num_deictic_patches=num_deictic_patches, max_num_groups=max_num_groups, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, double_q=False) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. 
U.initialize() # update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # get action to take # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] # qvalues = getq(np.array(obs)[None]) # action = np.argmax(qvalues) # if np.random.rand() < exploration.value(t): # action = np.random.randint(env.action_space.n) deicticObs = getDeicticObs(obs, deicticShape[0]) # qvalues = getq(np.array(deicticObs)) stateCurr = convertState(deicticObs) qvalues = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :] action = np.argmax(np.max(qvalues, 0)) selPatch = np.argmax(np.max(qvalues, 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # # temporarily take uniformly random actions all the time # action = np.random.randint(env.action_space.n) # env.render() new_obs, rew, done, _ = env.step(action) # display state, action, nextstate if t > 20000: toDisplay = np.reshape(new_obs, (8, 8)) toDisplay[ np. int32(np.floor_divide(selPatch, np.sqrt(num_deictic_patches))), np.int32(np.remainder(selPatch, np.sqrt(num_deictic_patches)) )] = 50 print( "Current/next state. 50 denotes the upper left corner of the deictic patch." ) print(str(toDisplay)) # env.render() # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > 20000: print("q-values:") print(str(qvalues)) print("*** Episode over! ***\n\n") if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Convert batch to deictic format obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic( obses_t, actions, obses_tp1, weights, deicticShape[0]) group_matching = getDeicticGroups(obses_t_deic, max_num_groups) stateCurr = convertState(obses_t_deic) stateNext = convertState(obses_tp1_deic) q_tp1 = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3], :] # q_tp1_target_parallel, desc_2_state_parallel, target_min_per_D_parallel, D_2_DI_parallel, targets_parallel = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, group_matching, dones, q_tp1) q_tp1_target, desc_2_state, target_min_per_D, D_2_DI, targets = parallelUpdate( obses_t_deic, actions_deic, rewards, obses_tp1_deic, group_matching, dones, q_tp1, batch_size, num_deictic_patches, max_num_groups) targets_simple = np.reshape( np.tile(np.reshape(q_tp1_target, [batch_size, 1]), [1, num_deictic_patches]), batch_size * num_deictic_patches) tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], actions_deic] = np.minimum( targets_simple, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], actions_deic]) # print("Num unique descriptors in batch: " + str(np.shape(np.unique(group_matching))[0])) # # for i in range(np.shape(obses_t_deic_small)[0]): # if i in agent_pos[0]: # # ax = agent_pos[np.nonzero(agent_pos[0] == i)[0][0]] # ax # if prioritized_replay: # new_priorities = np.abs(td_errors) + prioritized_replay_eps # replay_buffer.update_priorities(batch_idxes, 
#                                                  new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            print("best patch:\n" +
                  str(np.squeeze(deicticObs[np.argmax(np.max(qvalues, 1))])))
            print("worst patch:\n" +
                  str(np.squeeze(deicticObs[np.argmin(np.max(qvalues, 1))])))
            # if t > learning_starts:
            #     print("max td_error: " + str(np.sort(td_error)[-10:]))

    num2avg = 20
    rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg
    plt.plot(rListAvg)
    # plt.plot(episode_rewards)
    plt.show()
    sess
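A quick sanity check on the deictic bookkeeping used above: the grid observation is 8x8 (see the `np.reshape(new_obs, (8, 8))` display code), so a window length of 3 gives (8 - 3 + 1)^2 = 36 patches, which is exactly `num_deictic_patches`. A tiny helper makes the correspondence explicit:

# Number of deictic patches produced by getDeicticObs for a square observation.
def count_deictic_patches(obs_side, window_len):
    per_axis = obs_side - window_len + 1
    return per_axis * per_axis

assert count_deictic_patches(8, 3) == 36   # deicticShape = (3, 3, 1)
assert count_deictic_patches(8, 5) == 16   # deicticShape = (5, 5, 1)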
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, callback=None, tf_log_dir=None, tf_flush_freq=100, tf_model_freq=10000 ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() # inject some Tensorboard usage. tf_summary_writer = tf.summary.FileWriter('{}/summary'.format(tf_log_dir)) if tf_log_dir is not None else None tf_saver = tf.train.Saver(max_to_keep=10) if tf_log_dir is not None else None update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") print('====', model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done and tf_summary_writer is not None: summary = tf.Summary() summary.value.add(tag='info/episode_reward', simple_value=float(episode_rewards[-1])) summary.value.add(tag='info/esp', simple_value=float(update_eps)) tf_summary_writer.add_summary(summary, t) if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if tf_summary_writer is not None: summary = tf.Summary() summary.value.add(tag='model/loss', simple_value=float(td_errors[0])) # TODO: mean the loss tf_summary_writer.add_summary(summary, t) if t % tf_flush_freq == 0: tf_summary_writer.flush() if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) logger.log("Saving model path: {}".format(model_file)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if tf_saver is not None and t % tf_model_freq == 0: assert tf_log_dir is not None tf_saver.save(sess=sess, save_path='{}/model/model'.format(tf_log_dir), global_step=t) if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
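The `prioritized_replay_alpha`, `prioritized_replay_beta0`, and `prioritized_replay_eps` arguments documented above follow proportional prioritization (Schaul et al., 2016): priorities are |TD error| plus epsilon raised to alpha, sampling probabilities are the normalized priorities, and beta controls the importance-sampling correction. A small illustrative sketch of that math (the buffer class implements the same idea with a sum-tree for efficiency):

import numpy as np

def per_probabilities_and_weights(td_errors, alpha=0.6, beta=0.4, eps=1e-6):
    """Illustrative only: proportional prioritization.
    p_i = (|delta_i| + eps)^alpha, P(i) = p_i / sum_j p_j,
    w_i = (N * P(i))^(-beta), normalized by max_i w_i."""
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    weights = (len(td_errors) * probs) ** (-beta)
    return probs, weights / weights.max()

probs, weights = per_probabilities_and_weights(np.array([0.5, 0.1, 2.0]))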
class Agent: def __init__(self, net, actionSet, goalSet, defaultNSample, defaultRandomPlaySteps, controllerMemCap, explorationSteps, trainFreq, hard_update, controllerEpsilon=defaultControllerEpsilon): self.actionSet = actionSet self.controllerEpsilon = controllerEpsilon self.goalSet = goalSet self.nSamples = defaultNSample self.gamma = defaultGamma self.net = net self.memory = PrioritizedReplayBuffer(controllerMemCap, alpha=prioritized_replay_alpha) self.enable_double_dqn = True self.exploration = LinearSchedule(schedule_timesteps = explorationSteps, initial_p = 1.0, final_p = 0.02) self.defaultRandomPlaySteps = defaultRandomPlaySteps self.trainFreq = trainFreq self.randomPlay = True self.learning_done = False self.hard_update = hard_update def selectMove(self, state): if not self.learning_done: if self.controllerEpsilon < random.random(): return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0)) #return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4)), dummyYtrue, dummyMask], verbose=0)[1]) return random.choice(self.actionSet) else: return np.argmax(self.simple_net.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0)) def setControllerEpsilon(self, epsilonArr): self.controllerEpsilon = epsilonArr def criticize(self, reachGoal, action, die, distanceReward, useSparseReward): reward = 0.0 if reachGoal: reward += 1.0 #reward += 50.0 if die: reward -= 1.0 if not useSparseReward: reward += distanceReward reward = np.minimum(reward, maxReward) reward = np.maximum(reward, minReward) return reward def store(self, experience): self.memory.add(experience.state, experience.action, experience.reward, experience.next_state, experience.done) #self.memory.add(np.abs(experience.reward), experience) def compile(self): def huber_loss(y_true, y_pred, clip_value): assert clip_value > 0. x = y_true - y_pred if np.isinf(clip_value): return .5 * K.square(x) condition = K.abs(x) < clip_value squared_loss = .5 * K.square(x) linear_loss = clip_value * (K.abs(x) - .5 * clip_value) if K.backend() == 'tensorflow': import tensorflow as tf if hasattr(tf, 'select'): return tf.select(condition, squared_loss, linear_loss) # condition, true, false else: return tf.where(condition, squared_loss, linear_loss) # condition, true, false elif K.backend() == 'theano': from theano import tensor as T return T.switch(condition, squared_loss, linear_loss) else: raise RuntimeError('Unknown backend "{}".'.format(K.backend())) def clipped_masked_error(args): y_true, y_pred, mask = args loss = huber_loss(y_true, y_pred, 1) loss *= mask # apply element-wise mask return K.sum(loss, axis=-1) # Create trainable model. The problem is that we need to mask the output since we only # ever want to update the Q values for a certain action. The way we achieve this is by # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility # to mask out certain parameters by passing in multiple inputs to the Lambda layer. 
y_pred = self.net.controllerNet.output y_true = Input(name='y_true', shape=(nb_Action,)) mask = Input(name='mask', shape=(nb_Action,)) loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask]) ins = [self.net.controllerNet.input] if type(self.net.controllerNet.input) is not list else self.net.controllerNet.input trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred]) assert len(trainable_model.output_names) == 2 #combined_metrics = {trainable_model.output_names[1]: metrics} losses = [ lambda y_true, y_pred: y_pred, # loss is computed in Lambda layer lambda y_true, y_pred: K.zeros_like(y_pred), # we only include this for the metrics ] rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) trainable_model.compile(optimizer=rmsProp, loss=losses) self.trainable_model = trainable_model self.compiled = True def _update(self, stepCount): batches = self.memory.sample(self.nSamples, beta=beta_schedule.value(stepCount)) (stateVector, actionVector, rewardVector, nextStateVector, doneVector, importanceVector, idxVector) = batches stateVector = np.asarray(stateVector) nextStateVector = np.asarray(nextStateVector) q_values = self.net.controllerNet.predict(stateVector) assert q_values.shape == (self.nSamples, nb_Action) if self.enable_double_dqn: actions = np.argmax(q_values, axis = 1) assert actions.shape == (self.nSamples,) target_q_values = self.net.targetControllerNet.predict(nextStateVector) assert target_q_values.shape == (self.nSamples, nb_Action) q_batch = target_q_values[range(self.nSamples), actions] assert q_batch.shape == (self.nSamples,) else: target_q_values = self.net.targetControllerNet.predict(nextStateVector) q_batch = np.max(target_q_values, axis=1) assert q_batch.shape == (self.nSamples,) targets = np.zeros((self.nSamples, nb_Action)) dummy_targets = np.zeros((self.nSamples,)) masks = np.zeros((self.nSamples, nb_Action)) # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target targets accordingly, # but only for the affected output units (as given by action_batch). discounted_reward_batch = self.gamma * q_batch # Set discounted reward to zero for all states that were terminal. terminalBatch = np.array([1-float(done) for done in doneVector]) assert terminalBatch.shape == (self.nSamples,) discounted_reward_batch *= terminalBatch reward_batch = np.array(rewardVector) action_batch = np.array(actionVector) assert discounted_reward_batch.shape == reward_batch.shape Rs = reward_batch + discounted_reward_batch for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)): target[action] = R # update action with estimated accumulated reward dummy_targets[idx] = R mask[action] = 1. # enable loss for this specific action td_errors = targets[range(self.nSamples), action_batch] - q_values[range(self.nSamples), action_batch] new_priorities = np.abs(td_errors) + prioritized_replay_eps self.memory.update_priorities(idxVector, new_priorities) targets = np.array(targets).astype('float32') masks = np.array(masks).astype('float32') # Finally, perform a single update on the entire batch. We use a dummy target since # the actual loss is computed in a Lambda layer that needs more complex input. However, # it is still useful to know the actual target to compute metrics properly. 
ins = [stateVector] if type(self.net.controllerNet.input) is not list else stateVector if stepCount >= self.defaultRandomPlaySteps: loss = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets], sample_weight = [np.array(importanceVector), np.ones(self.nSamples)]) else: loss = [0.0,0.0,0.0] if stepCount > self.defaultRandomPlaySteps and stepCount % self.hard_update == 0: self.net.targetControllerNet.set_weights(self.net.controllerNet.get_weights()) return loss[1], np.mean(q_values), np.mean(np.abs(td_errors)) def update(self, stepCount): loss = self._update(stepCount) return loss def annealControllerEpsilon(self, stepCount, option_learned): if not self.randomPlay: if option_learned: self.controllerEpsilon = 0.0 else: if stepCount > self.defaultRandomPlaySteps: self.controllerEpsilon = self.exploration.value(stepCount - self.defaultRandomPlaySteps) #self.controllerEpsilon[goal] = exploration.value(stepCount - defaultRandomPlaySteps) def clear_memory(self, goal): self.learning_done = True ## Set the done learning flag del self.trainable_model del self.memory gpu = self.net.gpu del self.net gc.collect() rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) with tf.device('/gpu:'+str(gpu)): self.simple_net = Sequential() self.simple_net.add(Conv2D(32, (8,8), strides = 4, activation = 'relu', padding = 'valid', input_shape = (84,84,4))) self.simple_net.add(Conv2D(64, (4,4), strides = 2, activation = 'relu', padding = 'valid')) self.simple_net.add(Conv2D(64, (3,3), strides = 1, activation = 'relu', padding = 'valid')) self.simple_net.add(Flatten()) self.simple_net.add(Dense(HIDDEN_NODES, activation = 'relu', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED))) self.simple_net.add(Dense(nb_Action, activation = 'linear', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED))) self.simple_net.compile(loss = 'mse', optimizer = rmsProp) self.simple_net.load_weights(recordFolder+'/policy_subgoal_' + str(goal) + '.h5') self.simple_net.reset_states()
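This `Agent` class refers to several module-level names that are not shown in the snippet (`nb_Action`, `LEARNING_RATE`, `HIDDEN_NODES`, `beta_schedule`, `prioritized_replay_alpha`, `prioritized_replay_eps`, `maxReward`, `minReward`, and so on). The values below are placeholders only, included to show how those dependencies could be satisfied; none of them are taken from the original source.

# Illustrative module-level configuration (all values are assumptions).
nb_Action = 8                        # size of the controller's action set
LEARNING_RATE = 0.00025
HIDDEN_NODES = 512
SEED = 0
maxReward, minReward = 1.0, -1.0
defaultGamma = 0.99
defaultControllerEpsilon = 1.0
prioritized_replay_alpha = 0.6
prioritized_replay_eps = 1e-6
recordFolder = './records'           # where per-subgoal policies are stored
# beta annealed toward 1.0 over the training horizon, as _update() expects
beta_schedule = LinearSchedule(schedule_timesteps=1000000, initial_p=0.4, final_p=1.0)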
def deep_q_learning(sess, env, q_estimator, target_estimator, num_episodes, experiment_dir, replay_buffer_size=500000, replay_buffer_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=500000, prioritized_replay_eps=1e-6): """ Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: sess: Tensorflow Session object env: OpenAI environment q_estimator: Estimator object used for the q values target_estimator: Estimator object used for the targets num_episodes: Number of episodes to run for experiment_dir: Directory to save Tensorflow summaries in replay_memory_size: Size of the replay memory replay_memory_init_size: Number of random experiences to sampel when initializing the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps discount_factor: Gamma discount factor epsilon_start: Chance to sample a random action when taking an action. Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done epsilon_decay_steps: Number of steps to decay epsilon over batch_size: Size of batches to sample from the replay memory record_video_every: Record a video every N episodes Returns: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. """ # The replay buffer replay_buffer = PrioritizedReplayBuffer(replay_buffer_size, alpha=prioritized_replay_alpha) beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes), episode_transbag=np.zeros(num_episodes)) # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") # monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) copy_model_parameters(sess, q_estimator, target_estimator) print("\nCopied model parameters to target network.") total_t = sess.run(tf.contrib.framework.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy(q_estimator, env.action_space.n) # Populate the replay buffer with initial experience print("Populating replay buffer...") state = env.reset_test() for i in range(replay_buffer_init_size): action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)]) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(action) replay_buffer.add(state, action, reward, next_state, done) if i % 1000 == 0: print("\r{} in {} ".format(i, replay_buffer_init_size), end="") sys.stdout.flush() state = env.reset_test() else: state = next_state # Record videos # Use the gym env Monitor 
wrapper # env = Monitor(env, # directory=monitor_path, # resume=True, # video_callable=lambda count: count % record_video_every ==0) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset_test() loss = None # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] # Add epsilon to Tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="epsilon") q_estimator.summary_writer.add_summary(episode_summary, total_t) # Maybe update the target estimator if total_t % update_target_estimator_every == 0: copy_model_parameters(sess, q_estimator, target_estimator) print("\nCopied model parameters to target network.") # Print out which step we're on, useful for debugging. print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( t, total_t, i_episode + 1, num_episodes, loss), end="") sys.stdout.flush() # Take a step action_probs = policy(sess, state, epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, data_overflow = env.step(action) # Save transition to replay buffer replay_buffer.add(state, action, reward, next_state, done) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t stats.episode_transbag[i_episode] += data_overflow # Sample a minibatch from the replay buffer experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(total_t)) (states_batch, action_batch, reward_batch, next_states_batch, done_batch, weights_batch, batch_idxes) = experience # Calculate q values and targets (Double DQN) q_values_next = q_estimator.predict(sess, next_states_batch) best_actions = np.argmax(q_values_next, axis=1) q_values_next_target = target_estimator.predict( sess, next_states_batch) targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \ discount_factor * q_values_next_target[np.arange(batch_size), best_actions] # Perform gradient descent update loss, td_error = q_estimator.update(sess, states_batch, action_batch, targets_batch, weights_batch) new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # if done: # break if t >= 1000: break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add( simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward") episode_summary.value.add( simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length") episode_summary.value.add( simple_value=stats.episode_transbag[i_episode], node_name="episode_transbag", tag="episode_transbag") q_estimator.summary_writer.add_summary(episode_summary, total_t) q_estimator.summary_writer.flush() yield total_t, plotting.EpisodeStats( episode_lengths=stats.episode_lengths[:i_episode + 1], episode_rewards=stats.episode_rewards[:i_episode + 1], episode_transbag=stats.episode_transbag[:i_episode + 1]) #env.monitor.close() q_estimator.summary_writer.add_graph(sess.graph) return stats
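`deep_q_learning` is a generator: it yields the running statistics once per episode, so the caller drives training by iterating over it. A hedged usage sketch (the environment and the two estimator objects are placeholders, not defined here):

# Hypothetical caller; env, q_estimator and target_estimator are placeholders.
with tf.Session() as sess:
    for total_t, stats in deep_q_learning(sess, env,
                                          q_estimator=q_estimator,
                                          target_estimator=target_estimator,
                                          num_episodes=1000,
                                          experiment_dir="./experiments/per_dqn"):
        print("episode {}: reward {:.1f}".format(len(stats.episode_rewards),
                                                 stats.episode_rewards[-1]))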
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=100000, exploration_fraction=0.1, exploration_final_eps=0.1, train_freq=1, batch_size=64, print_freq=1, eval_freq=2500, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, csv_path="results.csv", method_type="baseline", **network_kwargs): """Train a deepr model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepr.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepr/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) #q_func = build_q_func(network, **network_kwargs) q_func = build_q_func(mlp(num_layers=4, num_hidden=64), **network_kwargs) #q_func = build_q_func(mlp(num_layers=2, num_hidden=64, activation=tf.nn.relu), **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule( schedule_timesteps=int(exploration_fraction * total_timesteps), #initial_p=1.0, initial_p=exploration_final_eps, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() eval_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) csvfile = open(csv_path, 'w', newline='') fieldnames = ['STEPS', 'REWARD'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for t in range(total_timesteps + 1): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: #update_eps = exploration.value(t) update_eps = exploration_final_eps update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. 
- exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action_mask = get_mask(env, method_type) a = act(np.array(obs)[None], unused_actions_neginf_mask=action_mask, update_eps=update_eps, **kwargs)[0] env_action = a reset = False new_obs, rew, done, _ = env.step(env_action) eval_rewards[-1] += rew action_mask_p = get_mask(env, method_type) # Shaping if method_type == 'shaping': ## look-ahead shaping ap = act(np.array(new_obs)[None], unused_actions_neginf_mask=action_mask_p, stochastic=False)[0] f = action_mask_p[ap] - action_mask[a] rew = rew + f # Store transition in the replay buffer. #replay_buffer.add(obs, a, rew, new_obs, float(done), action_mask_p) if method_type != 'shaping': replay_buffer.add(obs, a, rew, new_obs, float(done), np.zeros(env.action_space.n)) else: replay_buffer.add(obs, a, rew, new_obs, float(done), action_mask_p) obs = new_obs if t % eval_freq == 0: eval_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones, masks_tp1 = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights, masks_tp1) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_eval_reward = round(np.mean(eval_rewards[-1 - print_freq:-1]), 1) num_evals = len(eval_rewards) if t > 0 and t % eval_freq == 0 and print_freq is not None and t % ( print_freq * eval_freq) == 0: #if done and print_freq is not None and len(eval_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("evals", num_evals) logger.record_tabular("average reward in this eval", mean_eval_reward / (eval_freq)) logger.record_tabular("total reward in this eval", mean_eval_reward) logger.dump_tabular() writer.writerow({ "STEPS": t, "REWARD": mean_eval_reward / (eval_freq) }) csvfile.flush() if (checkpoint_freq is not None and t > learning_starts and num_evals > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_eval_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_eval_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_eval_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
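The 'shaping' branch above adds `f = action_mask_p[ap] - action_mask[a]` to the environment reward, a look-ahead difference of mask values that has the shape of potential-based reward shaping (Ng et al., 1999), with the mask value of the selected action playing the role of the state potential. The sketch below is only that reading, not the original `get_mask` semantics (which are not shown in this snippet); note the classical form also discounts the next-state potential by gamma:

# Illustrative only: shaping term mirroring the branch above.
def shaped_reward(rew, mask_s, a, mask_s_next, a_next_greedy, gamma=1.0):
    # potential of next state minus potential of current state;
    # with gamma == 1.0 this matches the snippet's f = mask_p[ap] - mask[a]
    f = gamma * mask_s_next[a_next_greedy] - mask_s[a]
    return rew + f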
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # env = gym.make("FrozenLake8x8rob-v0") # env = gym.make("FrozenLake16x16rob-v0") env = gym.make("TestRob3-v0") # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obses_t, windowLen): deicticObses_t = [] for i in range(np.shape(obses_t)[0] - windowLen + 1): for j in range(np.shape(obses_t)[1] - windowLen + 1): deicticObses_t.append(obses_t[i:i + windowLen, j:j + windowLen, :]) return np.array(deicticObses_t) # get set of deictic alternatives # input: batch x n x n x channels # output: (batch x deictic) x dn x dn x channels def getDeictic(obses_t, actions, obses_tp1, weights, windowLen): deicticObses_t = [] deicticActions = [] deicticObses_tp1 = [] deicticWeights = [] for i in range(np.shape(obses_t)[0]): for j in range(np.shape(obses_t)[1] - windowLen + 1): for k in range(np.shape(obses_t)[2] - windowLen + 1): deicticObses_t.append(obses_t[i, j:j + windowLen, k:k + windowLen, :]) deicticActions.append(actions[i]) deicticObses_tp1.append(obses_tp1[i, j:j + windowLen, k:k + windowLen, :]) deicticWeights.append(weights[i]) return np.array(deicticObses_t), np.array(deicticActions), np.array( deicticObses_tp1), np.array(deicticWeights) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong # hiddens=[256], # used in pong # convs=[(8,4,1)], # used for non-deictic TestRob3-v0 # convs=[(8,3,1)], # used for deictic TestRob3-v0 convs=[(16, 3, 1)], # used for deictic TestRob3-v0 # convs=[(4,3,1)], # used for deictic TestRob3-v0 # convs=[(16,3,1)], # used for deictic TestRob3-v0 # convs=[(8,2,1)], # used for deictic TestRob3-v0 hiddens=[16], dueling=True) # model = models.mlp([6]) # parameters q_func = model lr = 1e-3 # lr=1e-4 # max_timesteps=100000 # max_timesteps=50000 max_timesteps = 20000 buffer_size = 50000 # exploration_fraction=0.1 exploration_fraction = 0.2 exploration_final_eps = 0.02 # exploration_final_eps=0.005 # exploration_final_eps=0.1 print_freq = 10 checkpoint_freq = 10000 learning_starts = 1000 gamma = .98 target_network_update_freq = 500 prioritized_replay = False # prioritized_replay=True prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 num_cpu = 16 # batch_size=32 # train_freq=1 # batch_size=64 # train_freq=2 # batch_size=128 # train_freq=4 # batch_size=256 # train_freq=4 batch_size = 512 train_freq = 8 # deicticShape must be square. # These two parameters need to be consistent w/ each other. 
# deicticShape = (2,2,1) # num_deictic_patches=36 deicticShape = (3, 3, 1) num_deictic_patches = 36 # deicticShape = (4,4,1) # num_deictic_patches=25 # deicticShape = (5,5,1) # num_deictic_patches=16 # deicticShape = (6,6,1) # num_deictic_patches=9 # deicticShape = (7,7,1) # num_deictic_patches=4 # deicticShape = (8,8,1) # num_deictic_patches=1 def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(deicticShape, name=name) matchShape = (batch_size * 25, ) def make_match_ph(name): return U.BatchInput(matchShape, name=name) sess = U.make_session(num_cpu) sess.__enter__() # act, train, update_target, debug = build_graph.build_train( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min( make_obs_ph=make_obs_ph, make_match_ph=make_match_ph, q_func=q_func, num_actions=env.action_space.n, batch_size=batch_size, num_deictic_patches=num_deictic_patches, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, double_q=False) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # get action to take # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] # qvalues = getq(np.array(obs)[None]) # action = np.argmax(qvalues) # if np.random.rand() < exploration.value(t): # action = np.random.randint(env.action_space.n) deicticObs = getDeicticObs(obs, deicticShape[0]) qvalues = getq(np.array(deicticObs)) action = np.argmax(np.max(qvalues, 0)) selPatch = np.argmax(np.max(qvalues, 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # # temporarily take uniformly random actions all the time # action = np.random.randint(env.action_space.n) # env.render() new_obs, rew, done, _ = env.step(action) # display state, action, nextstate if t > 20000: toDisplay = np.reshape(new_obs, (8, 8)) toDisplay[ np. int32(np.floor_divide(selPatch, np.sqrt(num_deictic_patches))), np.int32(np.remainder(selPatch, np.sqrt(num_deictic_patches)) )] = 50 print( "Current/next state. 50 denotes the upper left corner of the deictic patch." ) print(str(toDisplay)) # env.render() # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > 20000: print("q-values:") print(str(qvalues)) print("*** Episode over! ***\n\n") if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Convert batch to deictic format obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic( obses_t, actions, obses_tp1, weights, deicticShape[0]) obses_t_deic_fingerprints = [ np.reshape(obses_t_deic[i], [deicticShape[0] * deicticShape[1]]) for i in range(np.shape(obses_t_deic)[0]) ] _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints, axis=0, return_index=True, return_inverse=True) # matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)] # td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) # td_errors2, min_values_of_groups2, match_onehot2 = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) td_errors, min_values_of_groups, match_onehot = train( obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) if t > learning_starts and t % train_freq == 0: group_counts = np.sum(match_onehot, 1) print(str(min_values_of_groups[min_values_of_groups < 1000])) # print(str(min_values_of_groups2[min_values_of_groups2 < 1000])) print(str(group_counts[group_counts > 0])) # display one of most valuable deictic patches min_values_of_groups_trunc = min_values_of_groups[ min_values_of_groups < 1000] most_valuable_patches_idx = np.argmax( min_values_of_groups_trunc) most_valuable_patches = obses_t_deic[fingerprintMatch == most_valuable_patches_idx] print( str(np.reshape(most_valuable_patches[0], deicticShape[0:2]))) print( "value of most valuable patch: " + str(min_values_of_groups_trunc[most_valuable_patches_idx])) print("sum group counts: " + str(np.sum(group_counts))) num2avg = 20 rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show() sess
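# Minimal standalone sketch (plain NumPy, not taken from the snippet above): it re-creates the
# getDeicticObs patch extraction to show why deicticShape=(3,3,1) pairs with
# num_deictic_patches=36 here: an n x n image has (n - w + 1)**2 overlapping w x w windows,
# and the 8x8 observation size is assumed from the np.reshape(new_obs, (8, 8)) display code above.
import numpy as np

def get_deictic_obs_sketch(obs, window_len):
    patches = []
    for i in range(obs.shape[0] - window_len + 1):
        for j in range(obs.shape[1] - window_len + 1):
            patches.append(obs[i:i + window_len, j:j + window_len, :])
    return np.array(patches)

_obs = np.random.randint(0, 10, size=(8, 8, 1))
_patches = get_deictic_obs_sketch(_obs, 3)
print(_patches.shape)                      # (36, 3, 3, 1): one row per deictic patch
assert _patches.shape[0] == (8 - 3 + 1) ** 2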
def main(): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # Dictionary-based value function q_func_tabular = {} defaultQValue = np.ones(env.action_space.n) # Given an integer, return the corresponding boolean array def getBoolBits(state): return np.unpackbits(np.uint8(state), axis=1) == 1 # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) return np.array([ q_func_tabular[x] if x in q_func_tabular else defaultQValue for x in keys ]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey, qCurrTargets): keys = getTabularKeys(vectorKey) alpha = 0.1 for i in range(len(keys)): if keys[i] in q_func_tabular: q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[ keys[i]] + alpha * qCurrTargets[i] # q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] max_timesteps = 200000 exploration_fraction = 0.3 exploration_final_eps = 0.02 print_freq = 1 gamma = .98 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 buffer_size = 100 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 valueFunctionType = "TABULAR" # valueFunctionType = "DQN" episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Set up replay buffer prioritized_replay = True # prioritized_replay=False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) sess = U.make_session(num_cpu) sess.__enter__() state = env.reset() episode_rewards = [0.0] timerStart = time.time() for t in range(max_timesteps): # np.unpackbits(np.uint8(np.reshape(states_tp1,[batch_size,1])),axis=1) qCurr = getTabular(getBoolBits([[state]])) qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly # select action at random action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action nextState, rew, done, _ = env.step(action) replay_buffer.add(state, action, rew, nextState, float(done)) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: beta = beta_schedule.value(t) states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actions, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None qNext = getTabular( getBoolBits(np.reshape(states_tp1, [batch_size, 1]))) qNextmax = np.max(qNext, axis=1) targets = rewards + (1 - dones) * gamma * qNextmax qCurrTarget = getTabular( getBoolBits(np.reshape(states_t, [batch_size, 1]))) td_error = qCurrTarget[range(batch_size), actions] - targets qCurrTarget[range(batch_size), actions] = targets trainTabular(getBoolBits(np.reshape(states_t, [batch_size, 1])), qCurrTarget) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: nextState = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal state = np.copy(nextState)
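# Minimal standalone sketch (plain NumPy, not the snippet's own helpers): it shows the idea
# behind getBoolBits/getTabularKeys above, where each discrete state is unpacked into bits,
# packed back into bytes, and the bytes are combined into a single integer used as the
# dictionary key of the tabular Q-function. Using Python ints via int.from_bytes is an
# illustrative alternative that avoids the 64-bit cap the uint64 accumulation above warns about.
import numpy as np

def get_bool_bits_sketch(states):
    # states: column of small non-negative ints, e.g. [[0], [5], [14]]
    return np.unpackbits(np.uint8(states), axis=1) == 1

def get_tabular_keys_sketch(bool_bits):
    packed = np.packbits(bool_bits, axis=1)
    # little-endian byte order matches the sum of (256 ** i) * byte_i used above
    return [int.from_bytes(row.tobytes(), byteorder='little') for row in packed]

_q_table = {}
_states = np.array([[0], [5], [14]])
for _key in get_tabular_keys_sketch(get_bool_bits_sketch(_states)):
    _q_table.setdefault(_key, np.ones(4))   # defaultQValue analogue for 4 actions
print(sorted(_q_table.keys()))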
class DDPG(DRL): """ Deep Deterministic Policy Gradient """ def __init__(self, env): super(DDPG, self).__init__() self.sess = K.get_session() self.env = env self.upper_bound = self.env.action_space.high[0] self.lower_bound = self.env.action_space.low[0] # update rate for target model. # for 2nd round training, use 0.000001 self.TAU = 0.00001 # learning rate for actor and critic # for 2nd round training, use 1e-5 self.actor_lr = 1e-4 self.critic_lr = 1e-4 # risk averse constant self.ra_c = 1.5 # actor: policy function # critic: Q functions; Q_ex, Q_ex2, and Q self.actor = self._build_actor(learning_rate=self.actor_lr) self.critic_Q_ex, self.critic_Q_ex2, self.critic_Q = self._build_critic( learning_rate=self.critic_lr) self.critic_Q.summary() # target networks for actor and three critics self.actor_hat = self._build_actor(learning_rate=self.actor_lr) self.actor_hat.set_weights(self.actor.get_weights()) self.critic_Q_ex_hat, self.critic_Q_ex2_hat, self.critic_Q_hat = self._build_critic( learning_rate=self.critic_lr) self.critic_Q_ex_hat.set_weights(self.critic_Q_ex.get_weights()) self.critic_Q_ex2_hat.set_weights(self.critic_Q_ex2.get_weights()) # epsilon of epsilon-greedy self.epsilon = 1.0 # discount rate for epsilon self.epsilon_decay = 0.99994 # self.epsilon_decay = 0.9994 # min epsilon of epsilon-greedy. self.epsilon_min = 0.1 # memory buffer for experience replay buffer_size = 600000 prioritized_replay_alpha = 0.6 self.replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) prioritized_replay_beta0 = 0.4 # need not be the same as training episode (see schedules.py) prioritized_replay_beta_iters = 50001 self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) # for numerical stabiligy self.prioritized_replay_eps = 1e-6 self.t = None # memory sample batch size self.batch_size = 128 # may use for 2nd round training # self.policy_noise = 5 # self.noise_clip = 5 # gradient function self.get_critic_grad = self.critic_gradient() self.actor_optimizer() def load(self, tag=""): """load two Qs for test""" if tag == "": actor_file = "model/ddpg_actor.h5" critic_Q_ex_file = "model/ddpg_critic_Q_ex.h5" critic_Q_ex2_file = "model/ddpg_critic_Q_ex2.h5" else: actor_file = "model/ddpg_actor_" + tag + ".h5" critic_Q_ex_file = "model/ddpg_critic_Q_ex_" + tag + ".h5" critic_Q_ex2_file = "model/ddpg_critic_Q_ex2_" + tag + ".h5" if os.path.exists(actor_file): self.actor.load_weights(actor_file) self.actor_hat.load_weights(actor_file) if os.path.exists(critic_Q_ex_file): self.critic_Q_ex.load_weights(critic_Q_ex_file) self.critic_Q_ex_hat.load_weights(critic_Q_ex_file) if os.path.exists(critic_Q_ex2_file): self.critic_Q_ex2.load_weights(critic_Q_ex2_file) self.critic_Q_ex2_hat.load_weights(critic_Q_ex2_file) def _build_actor(self, learning_rate=1e-3): """basic NN model. """ inputs = Input(shape=(self.env.num_state, )) # bn after input x = BatchNormalization()(inputs) # bn after activation x = Dense(32, activation="relu")(x) x = BatchNormalization()(x) x = Dense(64, activation="relu")(x) x = BatchNormalization()(x) # no bn for output layer x = Dense(1, activation="sigmoid")(x) output = Lambda(lambda x: x * self.env.num_contract * 100)(x) model = Model(inputs=inputs, outputs=output) # compile the model using mse loss, but won't use mse to train model.compile(loss="mse", optimizer=Adam(learning_rate)) return model def _build_critic(self, learning_rate=1e-3): """basic NN model. 
""" # inputs s_inputs = Input(shape=(self.env.num_state, )) a_inputs = Input(shape=(1, )) # combine inputs x = concatenate([s_inputs, a_inputs]) # bn after input x = BatchNormalization()(x) # Q_ex network # bn after activation x1 = Dense(32, activation="relu")(x) x1 = BatchNormalization()(x1) x1 = Dense(64, activation="relu")(x1) x1 = BatchNormalization()(x1) # no bn for output layer output1 = Dense(1, activation="linear")(x1) model_Q_ex = Model(inputs=[s_inputs, a_inputs], outputs=output1) model_Q_ex.compile(loss="mse", optimizer=Adam(learning_rate)) # Q_ex2 network # bn after activation x2 = Dense(32, activation="relu")(x) x2 = BatchNormalization()(x2) # bn after activation x2 = Dense(64, activation="relu")(x2) x2 = BatchNormalization()(x2) # no bn for output layer output2 = Dense(1, activation="linear")(x2) model_Q_ex2 = Model(inputs=[s_inputs, a_inputs], outputs=output2) model_Q_ex2.compile(loss="mse", optimizer=Adam(learning_rate)) # Q output3 = Lambda( lambda o: o[0] - self.ra_c * K.sqrt(K.max(o[1] - o[0] * o[0], 0)))( [output1, output2]) model_Q = Model(inputs=[s_inputs, a_inputs], outputs=output3) model_Q.compile(loss="mse", optimizer=Adam(learning_rate)) return model_Q_ex, model_Q_ex2, model_Q def actor_optimizer(self): """actor_optimizer. Returns: function, opt function for actor. """ self.ainput = self.actor.input aoutput = self.actor.output trainable_weights = self.actor.trainable_weights self.action_gradient = tf.placeholder(tf.float32, shape=(None, 1)) # tf.gradients calculates dy/dx with a initial gradients for y # action_gradient is dq/da, so this is dq/da * da/dparams params_grad = tf.gradients(aoutput, trainable_weights, -self.action_gradient) grads = zip(params_grad, trainable_weights) self.opt = tf.train.AdamOptimizer(self.actor_lr).apply_gradients(grads) self.sess.run(tf.global_variables_initializer()) def critic_gradient(self): """get critic gradient function. Returns: function, gradient function for critic. """ cinput = self.critic_Q.input coutput = self.critic_Q.output # compute the gradient of the action with q value, dq/da. action_grads = K.gradients(coutput, cinput[1]) return K.function([cinput[0], cinput[1]], action_grads) def egreedy_action(self, X): """get actor action with ou noise. Arguments: X: state value. """ # do the epsilon greedy way; not using OU if np.random.rand() <= self.epsilon: action = env.action_space.sample() # may use for 2nd round training # action = self.actor.predict(X)[0][0] # noise = np.clip(np.random.normal(0, self.policy_noise), -self.noise_clip, self.noise_clip) # action = np.clip(action + noise, 0, self.env.num_contract * 100) else: action = self.actor.predict(X)[0][0] return action, None, None def update_epsilon(self): """update epsilon """ if self.epsilon >= self.epsilon_min: self.epsilon *= self.epsilon_decay def remember(self, state, action, reward, next_state, done): """add data to experience replay. Arguments: state: observation action: action reward: reward next_state: next_observation done: if game is done. 
""" self.replay_buffer.add(state, action, reward, next_state, done) def process_batch(self, batch_size): """process batch data Arguments: batch: batch size Returns: states: batch of states actions: batch of actions target_q_ex, target_q_ex2: batch of targets; weights: priority weights """ # prioritized sample from experience replay buffer experience = self.replay_buffer.sample(batch_size, beta=self.beta_schedule.value( self.t)) (states, actions, rewards, next_states, dones, weights, batch_idxes) = experience actions = actions.reshape(-1, 1) rewards = rewards.reshape(-1, 1) dones = dones.reshape(-1, 1) # get next_actions next_actions = self.actor_hat.predict(next_states) # prepare targets for Q_ex and Q_ex2 training q_ex_next = self.critic_Q_ex_hat.predict([next_states, next_actions]) q_ex2_next = self.critic_Q_ex2_hat.predict([next_states, next_actions]) target_q_ex = rewards + (1 - dones) * q_ex_next target_q_ex2 = rewards**2 + (1 - dones) * (2 * rewards * q_ex_next + q_ex2_next) # use Q2 TD error as priority weight td_errors = self.critic_Q_ex2.predict([states, actions]) - target_q_ex2 new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps).flatten() self.replay_buffer.update_priorities(batch_idxes, new_priorities) return states, actions, target_q_ex, target_q_ex2, weights def update_model(self, X1, X2, y1, y2, weights): """update ddpg model. Arguments: X1: states X2: actions y1: target for Q_ex y2: target for Q_ex2 weights: priority weights Returns: loss_ex: critic Q_ex loss loss_ex2: critic Q_ex2 loss """ # flatten to prepare for training with weights weights = weights.flatten() # default batch size is 32 loss_ex = self.critic_Q_ex.fit([X1, X2], y1, sample_weight=weights, verbose=0) loss_ex = np.mean(loss_ex.history['loss']) # default batch size is 32 loss_ex2 = self.critic_Q_ex2.fit([X1, X2], y2, sample_weight=weights, verbose=0) loss_ex2 = np.mean(loss_ex2.history['loss']) X3 = self.actor.predict(X1) a_grads = np.array(self.get_critic_grad([X1, X3]))[0] self.sess.run(self.opt, feed_dict={ self.ainput: X1, self.action_gradient: a_grads }) return loss_ex, loss_ex2 def update_target_model(self): """soft update target model. 
""" critic_Q_ex_weights = self.critic_Q_ex.get_weights() critic_Q_ex2_weights = self.critic_Q_ex2.get_weights() actor_weights = self.actor.get_weights() critic_Q_ex_hat_weights = self.critic_Q_ex_hat.get_weights() critic_Q_ex2_hat_weights = self.critic_Q_ex2_hat.get_weights() actor_hat_weights = self.actor_hat.get_weights() for i in range(len(critic_Q_ex_weights)): critic_Q_ex_hat_weights[i] = self.TAU * critic_Q_ex_weights[i] + ( 1 - self.TAU) * critic_Q_ex_hat_weights[i] for i in range(len(critic_Q_ex2_weights)): critic_Q_ex2_hat_weights[i] = self.TAU * critic_Q_ex2_weights[ i] + (1 - self.TAU) * critic_Q_ex2_hat_weights[i] for i in range(len(actor_weights)): actor_hat_weights[i] = self.TAU * actor_weights[i] + ( 1 - self.TAU) * actor_hat_weights[i] self.critic_Q_ex_hat.set_weights(critic_Q_ex_hat_weights) self.critic_Q_ex2_hat.set_weights(critic_Q_ex2_hat_weights) self.actor_hat.set_weights(actor_hat_weights) def train(self, episode): """training Arguments: episode: total episodes to run Returns: history: training history """ # some statistics history = { "episode": [], "episode_w_T": [], "loss_ex": [], "loss_ex2": [] } for i in range(episode): observation = self.env.reset() done = False # for recording purpose y_action = np.empty(0, dtype=int) reward_store = np.empty(0) self.t = i # steps in an episode while not done: # prepare state x = np.array(observation).reshape(1, -1) # chocie action from epsilon-greedy. action, _, _ = self.egreedy_action(x) # one step observation, reward, done, info = self.env.step(action) # record action and reward y_action = np.append(y_action, action) reward_store = np.append(reward_store, reward) # store to memory self.remember(x[0], action, reward, observation, done) if len(self.replay_buffer) > self.batch_size: # draw from memory X1, X2, y_ex, y_ex2, weights = self.process_batch( self.batch_size) # update model loss_ex, loss_ex2 = self.update_model( X1, X2, y_ex, y_ex2, weights) # soft update target self.update_target_model() # reduce epsilon per episode self.update_epsilon() # print/store some statistics every 1000 episodes if i % 1000 == 0 and i != 0: # may want to print/store some statistics every 100 episodes # if i % 100 == 0 and i >= 1000: # get w_T for statistics w_T = np.sum(reward_store) history["episode"].append(i) history["episode_w_T"].append(w_T) history["loss_ex"].append(loss_ex) history["loss_ex2"].append(loss_ex2) path_row = info["path_row"] print(info) print( "episode: {} | episode final wealth: {:.3f} | loss_ex: {:.3f} | loss_ex2: {:.3f} | epsilon:{:.2f}" .format(i, w_T, loss_ex, loss_ex2, self.epsilon)) with np.printoptions(precision=2, suppress=True): print("episode: {} | rewards {}".format(i, reward_store)) print("episode: {} | actions taken {}".format(i, y_action)) print("episode: {} | deltas {}".format( i, self.env.delta_path[path_row] * 100)) print("episode: {} | stock price {}".format( i, self.env.path[path_row])) print("episode: {} | option price {}\n".format( i, self.env.option_price_path[path_row] * 100)) # may want to save model every 100 episode # if i % 100 == 0: # self.actor.save_weights("model/ddpg_actor_" + str(int(i/100)) + ".h5") # self.critic_Q_ex.save_weights("model/ddpg_critic_Q_ex_" + str(int(i/100)) + ".h5") # self.critic_Q_ex2.save_weights("model/ddpg_critic_Q_ex2_" + str(int(i/100)) + ".h5") self.actor.save_weights("model/ddpg_actor_" + str(int(i / 1000)) + ".h5") self.critic_Q_ex.save_weights("model/ddpg_critic_Q_ex_" + str(int(i / 1000)) + ".h5") self.critic_Q_ex2.save_weights("model/ddpg_critic_Q_ex2_" + 
str(int(i / 1000)) + ".h5") # save weights once training is done self.actor.save_weights("model/ddpg_actor.h5") self.critic_Q_ex.save_weights("model/ddpg_critic_Q_ex.h5") self.critic_Q_ex2.save_weights("model/ddpg_critic_Q_ex2.h5") return history
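# Minimal standalone sketch (plain NumPy with made-up numbers): it spells out the target
# construction used by process_batch above. With R = r + R' (no discount factor appears in
# these targets), E[R^2] = r^2 + 2*r*E[R'] + E[R'^2], which is why the Q_ex2 target mixes the
# reward with both target critics; the risk-averse critic then penalizes the variance estimate
# E[R^2] - E[R]^2 with the constant ra_c.
import numpy as np

ra_c = 1.5
rewards    = np.array([[0.5], [-1.0]])
dones      = np.array([[0.0], [1.0]])
q_ex_next  = np.array([[2.0], [3.0]])     # target critic estimate of E[R' | s', a']
q_ex2_next = np.array([[5.0], [10.0]])    # target critic estimate of E[R'^2 | s', a']

target_q_ex  = rewards + (1 - dones) * q_ex_next
target_q_ex2 = rewards ** 2 + (1 - dones) * (2 * rewards * q_ex_next + q_ex2_next)

variance = np.maximum(target_q_ex2 - target_q_ex ** 2, 0.0)   # clip negative variance estimates
risk_averse_q = target_q_ex - ra_c * np.sqrt(variance)
print(target_q_ex.ravel(), target_q_ex2.ravel(), risk_averse_q.ravel())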
def dist_learn(env, q_dist_func, num_atoms=51, V_max=10, lr=25e-5, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.01, exploration_final_eps=0.008, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=2000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=1, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.single_threaded_session() sess.__enter__() def make_obs_ph(name): print name return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = build_dist_train( make_obs_ph=make_obs_ph, dist_func=q_dist_func, num_actions=env.action_space.n, num_atoms=num_atoms, V_max=V_max, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) # act, train, update_target, debug = build_train( # make_obs_ph=make_obs_ph, # q_func=q_func, # num_actions=env.action_space.n, # optimizer=tf.train.AdamOptimizer(learning_rate=lr), # gamma=gamma, # grad_norm_clipping=10 # ) act_params = { 'make_obs_ph': make_obs_ph, 'q_dist_func': q_dist_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") print model_file # mkdir_p(os.path.dirname(model_file)) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: # print "CCCC" obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # print "Come1" # print np.shape(obses_t), np.shape(actions), np.shape(rewards), np.shape(obses_tp1), np.shape(dones) td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # print "Loss : {}".format(td_errors) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: print "steps : {}".format(t) print "episodes : {}".format(num_episodes) print "mean 100 episode reward: {}".format(mean_100ep_reward) # print "mean 100 episode reward".format(mean_100ep_reward) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() # logger.record_tabular("steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and t % checkpoint_freq == 0): print "==========================" print "Error: {}".format(td_errors) if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: print "Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward) # logger.log("Saving model due to mean reward increase: {} -> {}".format( # saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: print "Restored model with mean reward: {}".format( saved_mean_reward) # logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
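# Minimal standalone sketch (plain NumPy): build_dist_train is not shown in this file, so this
# is only a generic categorical-DQN style projection consistent with the num_atoms / V_max
# arguments above, with V_min assumed to be -V_max. It projects the backed-up distribution
# r + gamma * z onto the fixed support of num_atoms atoms.
import numpy as np

def project_distribution_sketch(next_probs, rewards, dones, gamma,
                                num_atoms=51, v_min=-10.0, v_max=10.0):
    delta_z = (v_max - v_min) / (num_atoms - 1)
    support = v_min + delta_z * np.arange(num_atoms)
    projected = np.zeros_like(next_probs)
    for b in range(next_probs.shape[0]):
        for j in range(num_atoms):
            tz = np.clip(rewards[b] + (1.0 - dones[b]) * gamma * support[j], v_min, v_max)
            pos = (tz - v_min) / delta_z
            lo, hi = int(np.floor(pos)), int(np.ceil(pos))
            if lo == hi:
                projected[b, lo] += next_probs[b, j]
            else:
                projected[b, lo] += next_probs[b, j] * (hi - pos)
                projected[b, hi] += next_probs[b, j] * (pos - lo)
    return projected

_probs = np.full((2, 51), 1.0 / 51)   # uniform next-state distributions
_proj = project_distribution_sketch(_probs, np.array([1.0, 0.0]), np.array([0.0, 1.0]), gamma=1.0)
print(_proj.sum(axis=1))              # each projected distribution still sums to 1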
class Agent(): def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4, hidden_layers_size=[64, 32], update_every=4, update_target_very=12, alpha=0.6, beta=0.4, beta_increment=1e-3, prior_eps=1e-6): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed alpha: determines how much prioritization is used beta: determines how much importance smapling is used beta_increment: linear increment of beta prior_eps : guarantees every transition can be sampled """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.update_target_very = update_target_very self.alpha = 0.6 self.beta = 0.4 self.beta_increment = beta_increment self.prior_eps = prior_eps # Q-Network self.qnetwork_local = DQNNetwork( state_size, action_size, seed, hidden_layers_size=hidden_layers_size).to(device) self.qnetwork_target = DQNNetwork( state_size, action_size, seed, hidden_layers_size=hidden_layers_size).to(device) self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict()) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = PrioritizedReplayBuffer(action_size, buffer_size, batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.t_target_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) #linear increase of beta self.beta = min(self.beta + self.beta_increment, 1.0) # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: # Learn every UPDATE_EVERY time steps. self.t_step = self.t_step + 1 if self.t_step % self.update_every == 0: experiences = self.memory.sample(self.beta) # print(experiences[6]) self.learn(experiences, self.gamma) """ a implementation of fixed Q-Targets """ if self.t_step % self.update_target_very == 0: self.update_target_Q() def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma, method='DQN'): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, weights, indices = experiences #already to(device) ## TODO: compute and minimize the loss self.optimizer.zero_grad() if method == 'DQN': target_values = self.qnetwork_target.forward(next_states) #Q Learning with the max q(next_state, a) accumulated_rewards = rewards.squeeze( 1) + gamma * target_values.max(dim=1)[0] else: max_actions = self.qnetwork_local.forward(next_states) max_actions = max_actions.argmax(dim=1).unsqueeze(1) target_values = self.qnetwork_target.forward(next_states) evaluate_target_values = target_values.gather(1, max_actions) accumulated_rewards = rewards.squeeze( 1) + gamma * evaluate_target_values.squeeze(1) # get the old q(current_state, action) old_values = self.qnetwork_local.forward(states).gather( 1, actions).squeeze(1) #detect done done_index = dones.argmax().item() if dones[done_index].item(): accumulated_rewards[ done_index] = 0.0 #should not be rewards[done_index], which is acturally -100 elementary_loss = (accumulated_rewards - old_values).pow(2) loss = (elementary_loss * weights.squeeze(1)).mean() loss.backward() self.optimizer.step() #update transition priority loss_for_prior = elementary_loss.detach().cpu().numpy() loss_for_prior = loss_for_prior + self.prior_eps self.memory.update_priority(indices, loss_for_prior) # ------------------- update target network ------------------- # # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def criterion(self, accumulated_rewords, old_values): return (accumulated_rewords - old_values).pow(2).mean() def update_target_Q(self): # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict()) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
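# Minimal standalone sketch (PyTorch, random tensors standing in for a sampled batch): it
# isolates the pattern used in Agent.learn() above, where the per-sample squared TD error is
# scaled by the importance-sampling weights for the gradient step and the same per-sample error
# (plus prior_eps) is fed back to the buffer as the new priorities.
import torch

batch_size, prior_eps = 8, 1e-6
q_taken   = torch.randn(batch_size, requires_grad=True)   # Q(s, a) from the online net
td_target = torch.randn(batch_size)                        # r + gamma * max_a' Q_target(s', a')
weights   = torch.rand(batch_size)                         # importance-sampling weights

elementwise_loss = (td_target - q_taken).pow(2)
loss = (elementwise_loss * weights).mean()
loss.backward()

# this agent reuses the squared error (not |TD error|) as the priority
new_priorities = elementwise_loss.detach().cpu().numpy() + prior_eps
print(loss.item(), new_priorities.round(3))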
class Agent: # @todo: when instantiating two of these, it raises an Exception, because it tries to redefine # @todo: the scopes or variables (the names are already taken) # @todo: FIX THIS !!! """ We don't use the bundle entropy method to optimize wrt actions, but rather plain SGD (or rather Adam) """ def __init__(self, dimO, dimA, beta, layers_dim, finalize_graph=True): """ :param finalize_graph: if you want to restore a model, using .restore(), set this param to False """ self.dimA = dimA self.dimO = dimO self.beta = beta self.layers_dim = layers_dim tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate self.opt = self.adam self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, FLAGS.alpha) self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True)) self.noise = np.zeros(self.dimA) per_weights = tf.placeholder(tf.float32, [None], 'per_weights') obs = tf.placeholder(tf.float32, [None, dimO], "obs") act = tf.placeholder(tf.float32, [None, dimA], "act") rew = tf.placeholder(tf.float32, [None], "rew") with tf.variable_scope('q'): negQ = self.negQ(obs, act) q = -negQ act_grad, = tf.gradients(negQ, act) obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target") act_target = tf.placeholder(tf.float32, [None, dimA], "act_target") term_target = tf.placeholder(tf.bool, [None], "term_target") with tf.variable_scope('q_target'): negQ_target = self.negQ(obs_target, act_target) act_target_grad, = tf.gradients(negQ_target, act_target) q_target = -negQ_target y = tf.where(term_target, rew, rew + discount * q_target) y = tf.maximum(q - 1., y) y = tf.minimum(q + 1., y) y = tf.stop_gradient(y) print('y shape', y.get_shape()) print('q shape', q.get_shape()) td_error = q - y print('per weights shape', per_weights.get_shape()) print('multi td error^2 per weights shape', tf.multiply(tf.square(td_error), per_weights).get_shape()) ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weights), 0) print('ms td error shape', ms_td_error.get_shape()) regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/') loss_q = ms_td_error + \ l2norm * tf.reduce_sum(regLosses) + \ FLAGS.alpha_beyond * tf.reduce_sum( tf.where( q > FLAGS.RMAX, tf.square(q - FLAGS.RMAX), tf.zeros((FLAGS.bsize,))) + tf.where( q < FLAGS.RMIN, tf.square(q - FLAGS.RMIN), tf.zeros((FLAGS.bsize,))), 0 ) self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/') self.theta_cvx_ = [v for v in self.theta_ if 'proj' in v.name and 'W:' in v.name] self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_] self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_target/') update_target = [theta_target_i.assign_sub(tau * (theta_target_i - theta_i)) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)] optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'), self.sess.graph) tf.summary.scalar('Qvalue (batch avg)', tf.reduce_mean(q)) tf.summary.scalar('Qvalue (batch max)', tf.reduce_max(q)) tf.summary.scalar('Qvalue (batch min)', tf.reduce_min(q)) tf.summary.scalar('Q targets (batch avg)', tf.reduce_mean(q_target)) tf.summary.scalar('Q targets (batch min)', tf.reduce_min(q_target)) 
tf.summary.scalar('Q targets (batch max)', tf.reduce_max(q_target)) tf.summary.scalar('loss', ms_td_error) tf.summary.scalar('td error', tf.reduce_mean(tf.abs(td_error))) tf.summary.scalar('reward', tf.reduce_mean(rew)) tf.summary.scalar('chosen actions', tf.reduce_mean(act)) tf.summary.scalar('maximizing action (batch avg)', tf.reduce_mean(act_target)) tf.summary.scalar('maximizing action (batch max)', tf.reduce_max(act_target)) tf.summary.scalar('maximizing action (batch min)', tf.reduce_min(act_target)) merged = tf.summary.merge_all() # tf functions with self.sess.as_default(): self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weights], [optimize_q, update_target, loss_q, tf.abs(td_error), q, q_target], merged, summary_writer) self._fg = Fun([obs, act], [negQ, act_grad]) self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=100) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.global_variables_initializer()) self.sess.run(self.makeCvx) self.sess.run([theta_target_i.assign(theta_i) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]) if finalize_graph: self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def adam(self, func, obs, plot=False): """Optimizer to find the greedy action""" # if npr.random() < 1./20: # plot = True b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None] * 3 hist = {'act': [], 'f': [], 'g': []} for i in range(1000): f, g = func(obs, act) if plot: hist['act'].append(act.copy()) hist['f'].append(f) hist['g'].append(g) if i == 0: act_best = act.copy() f_best = f.copy() else: prev_act_best = act_best.copy() I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1)) a_diff = a_diff_i if a_diff is None \ else lam * a_diff + (1. - lam) * a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff < 1e-3 and i > 5: if plot: self.adam_plot(func, obs, hist) return act_best m = b1 * m + (1. - b1) * g v = b2 * v + (1. - b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m / (1. - b1t) vhat = v / (1. - b2t) act -= alpha * mhat / (np.sqrt(v) + eps) act = np.clip(act, FLAGS.a_min + 1e-8, FLAGS.a_max - 1e-8) print(' + Warning: Adam did not converge.') if plot: self.adam_plot(func, obs, hist) return act_best def adam_plot(self, func, obs, hist): hist['act'] = np.array(hist['act']).T hist['f'] = np.array(hist['f']).T hist['g'] = np.array(hist['g']).T if self.dimA == 1: xs = np.linspace(-1. + 1e-8, 1. 
- 1e-8, 100) ys = [func(obs[[0], :], [[xi]])[0] for xi in xs] fig = plt.figure() plt.plot(xs, ys) plt.plot(hist['act'][0, 0, :], hist['f'][0, :], label='Adam') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) elif self.dimA == 2: assert (False) else: xs = npr.uniform(-1., 1., (5000, self.dimA)) ys = np.array([func(obs[[0], :], [xi])[0] for xi in xs]) epi = np.hstack((xs, ys)) pca = PCA(n_components=2).fit(epi) W = pca.components_[:, :-1] xs_proj = xs.dot(W.T) fig = plt.figure() X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100) Z = griddata(xs_proj[:, 0], xs_proj[:, 1], ys.ravel(), X, Y, interp='linear') plt.contourf(X, Y, Z, 15) plt.colorbar() adam_x = hist['act'][:, 0, :].T adam_x = adam_x.dot(W.T) plt.plot(adam_x[:, 0], adam_x[:, 1], label='Adam', color='k') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) def reset(self, obs): self.noise = np.zeros(self.dimA) self.observation = obs # initial observation def act(self, test=False): """ Greedily choose action There is noise during training """ with self.sess.as_default(): obs = np.expand_dims(self.observation, axis=0) f = self._fg tflearn.is_training(False) action = self.opt(f, obs) tflearn.is_training(not test) if not test: # sig = (self.t < 40000) * (self.t * (FLAGS.ousigma_end - FLAGS.ousigma_start) / 40000 + FLAGS.ousigma_start) + (self.t >= 40000) * FLAGS.ousigma_end # self.noise = sig * npr.randn(self.dimA) self.noise -= FLAGS.outheta * self.noise - FLAGS.ousigma * npr.randn(self.dimA) action += self.noise action = np.clip(action, FLAGS.a_min, FLAGS.a_max) self.action = np.atleast_1d(np.squeeze(action, axis=0)) return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.rm.add(*(obs1, self.action, rew, obs2, term)) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): self.t += 1 beta = self.beta(self.t) with self.sess.as_default(): obs, act, rew, ob2, term2, w, idx = self.rm.sample(FLAGS.bsize, beta) rew, term2, w = rew.squeeze(), term2.squeeze(), w.squeeze() # fix dimensions # w = np.ones(w.shape) # no prioritization f = self._fg_target tflearn.is_training(False) act2 = self.opt(f, ob2) tflearn.is_training(True) _, _, loss, td_error, q, q_target = self._train(obs, act, rew, ob2, act2, term2, w, log=FLAGS.summary, global_step=self.t) self.sess.run(self.proj) # keep some weights positive # self.rm.update_priorities(idx, np.array(td_error.shape[0] * [1.])) # no prioritization self.rm.update_priorities(idx, td_error + 1e-2) return loss, td_error, q, q_target def negQ(self, x, y, reuse=False): """Architecture of the neural network""" print('x shape', x.get_shape()) print('y shape', y.get_shape()) szs = self.layers_dim assert (len(szs) >= 1) fc = tflearn.fully_connected bn = tflearn.batch_normalization lrelu = tflearn.activations.leaky_relu if reuse: tf.get_variable_scope().reuse_variables() nLayers = len(szs) us = [] zs = [] z_zs = [] z_ys = [] z_us = [] reg = 'L2' prevU = x for i in range(nLayers): with tf.variable_scope('u' + str(i), reuse=reuse) as s: u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg) if i < nLayers - 1: u = tf.nn.relu(u) if FLAGS.icnn_bn: u = bn(u, reuse=reuse, scope=s, name='bn') variable_summaries(u, suffix='u{}'.format(i)) us.append(u) prevU = u prevU, prevZ = x, y for i in range(nLayers + 1): sz = 
szs[i] if i < nLayers else 1 z_add = [] if i > 0: with tf.variable_scope('z{}_zu_u'.format(i), reuse=reuse) as s: zu_u = fc(prevU, szs[i - 1], reuse=reuse, scope=s, activation='relu', bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(zu_u, suffix='zu_u{}'.format(i)) with tf.variable_scope('z{}_zu_proj'.format(i), reuse=reuse) as s: z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) variable_summaries(z_zu, suffix='z_zu{}'.format(i)) z_zs.append(z_zu) z_add.append(z_zu) with tf.variable_scope('z{}_yu_u'.format(i), reuse=reuse) as s: yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(yu_u, suffix='yu_u{}'.format(i)) with tf.variable_scope('z{}_yu'.format(i), reuse=reuse) as s: z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) z_ys.append(z_yu) variable_summaries(z_yu, suffix='z_yu{}'.format(i)) z_add.append(z_yu) with tf.variable_scope('z{}_u'.format(i), reuse=reuse) as s: z_u = fc(prevU, sz, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(0.)) variable_summaries(z_u, suffix='z_u{}'.format(i)) z_us.append(z_u) z_add.append(z_u) z = tf.add_n(z_add) variable_summaries(z, suffix='z{}_preact'.format(i)) if i < nLayers: # z = tf.nn.relu(z) z = lrelu(z, alpha=FLAGS.lrelu) variable_summaries(z, suffix='z{}_act'.format(i)) zs.append(z) prevU = us[i] if i < nLayers else None prevZ = z print('z shape', z.get_shape()) z = tf.reshape(z, [-1], name='energies') return z def save(self, path): self.saver.save(self.sess, path) def restore(self, filename): """ IMPORTANT: Filename should be the filepath to the 4 following files: - 50314.index - 50314.meta - 50314.data-00000-of-00001 - checkpoint Note that it shouldn't include any extension. In this case, it would therefore be `tensorboard/models/50314` Note that it is `50314` because I used the global training step as a filename to save model !!!! BESIDES YOU SHOULD HAVE INSTANTIATED THE AGENT WITH `finalize_graph=False` !!!! """ self.saver = tf.train.import_meta_graph(filename+'.meta') self.saver.restore(self.sess, filename) self.sess.graph.finalize() def __del__(self): self.sess.close()
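# Minimal standalone sketch (plain NumPy with made-up numbers, discount assumed 0.99): it
# restates two pieces of the graph built in __init__ above: the TD target
# y = r + discount * Q_target, clamped to stay within 1 of the current Q estimate, with the
# squared error weighted by the PER importance weights, and the soft target update
# theta_target <- theta_target - tau * (theta_target - theta).
import numpy as np

tau, discount = 0.001, 0.99
q        = np.array([1.0, 2.0, 3.0])      # online Q(s, a)
q_target = np.array([1.5, 2.5, 0.0])      # target-network Q(s', a')
rew      = np.array([0.1, -0.2, 1.0])
term     = np.array([False, False, True])
per_w    = np.array([0.9, 1.0, 0.7])      # per_weights from the prioritized buffer

y = np.where(term, rew, rew + discount * q_target)
y = np.clip(y, q - 1.0, q + 1.0)          # keep targets within 1 of the current estimate
td_error = q - y
weighted_loss = np.sum(np.square(td_error) * per_w)

theta, theta_target = np.ones(4), np.zeros(4)
theta_target -= tau * (theta_target - theta)
print(weighted_loss, theta_target)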
def learn(env, args, callback=None): dist_deepq = DistDeepQ(env, args) if args.prioritized_replay: replay_buffer = PrioritizedReplayBuffer( args.buffer_size, alpha=args.prioritized_replay_alpha) args.prioritized_replay_beta_iters = args.max_timesteps beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters, initial_p=args.prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(args.buffer_size) beta_schedule = None exploration = LinearSchedule(schedule_timesteps=int( args.exploration_fraction * args.max_timesteps), initial_p=1.0, final_p=args.exploration_final_eps) # dist_deepq.sample_noise() dist_deepq.update_target() episode_rewards = [0.0] saved_mean_reward = None ob = env.reset() for t in range(args.max_timesteps): if callback is not None: if callback(locals(), globals()): break update_eps = exploration.value(t) action = dist_deepq.act_distributional(ob, update_eps) # action = dist_deepq.act_noisy_distributional(ob) new_ob, rew, done, _ = env.step(action) replay_buffer.add(ob, action, rew, new_ob, float(done)) ob = new_ob episode_rewards[-1] += rew if done: ob = env.reset() episode_rewards.append(0.0) reset = True if t > args.learning_starts and t % args.train_freq == 0: if args.prioritized_replay: experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(t)) (obs, actions, rewards, obs_next, dones, weights, batch_idxes) = experience else: obs, actions, rewards, obs_next, dones = replay_buffer.sample( args.batch_size) weights, batch_idxes = np.ones_like(rewards), None kl_errors = dist_deepq.distributional_update( obs, actions, rewards, obs_next, dones, weights) # dist_deepq.sample_noise() if args.prioritized_replay: replay_buffer.update_priorities(batch_idxes, kl_errors) if t > args.learning_starts and t % args.target_network_update_freq == 0: dist_deepq.update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and args.print_freq is not None and len( episode_rewards) % args.print_freq == 0: print('steps {} episodes {} mean reward {}'.format( t, num_episodes, mean_100ep_reward))
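# Minimal standalone sketch (an assumed re-implementation, not the project's own schedules
# module): this mirrors the usual baselines-style LinearSchedule semantics implied by the calls
# above, interpolating linearly from initial_p to final_p over schedule_timesteps and holding
# final_p afterwards. Both the epsilon annealing and the beta annealing in this file use it
# the same way.
class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

_exploration = LinearScheduleSketch(schedule_timesteps=1000, initial_p=1.0, final_p=0.02)
_beta = LinearScheduleSketch(10000, initial_p=0.4, final_p=1.0)
print(_exploration.value(0), _exploration.value(500), _exploration.value(5000))
print(_beta.value(0), _beta.value(10000))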
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") env = gym.make("FrozenLake8x8nohole-v0") # robShape = (2,) # robShape = (3,) # robShape = (200,) # robShape = (16,) robShape = (64,) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(robShape, name=name) # # these params are specific to mountaincar # def getOneHotObs(obs): # obsFraction = (obs[0] + 1.2) / 1.8 # idx1 = np.int32(np.trunc(obsFraction*100)) # obsFraction = (obs[1] + 0.07) / 0.14 # idx2 = np.int32(np.trunc(obsFraction*100)) # ident = np.identity(100) # return np.r_[ident[idx1,:],ident[idx2,:]] # these params are specific to frozenlake def getOneHotObs(obs): # ident = np.identity(16) ident = np.identity(64) return ident[obs,:] model = models.mlp([32]) # model = models.mlp([64]) # model = models.mlp([64], layer_norm=True) # model = models.mlp([16, 16]) # parameters q_func=model lr=1e-3 # max_timesteps=100000 max_timesteps=50000 # max_timesteps=10000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1.0 target_network_update_freq=500 # prioritized_replay=False prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 # # try mountaincar w/ different input dimensions # inputDims = [50,2] sess = U.make_session(num_cpu) sess.__enter__() act, train, update_target, debug = build_graph.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() obs = getOneHotObs(obs) # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) new_obs = getOneHotObs(new_obs) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() obs = getOneHotObs(obs) episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # if done: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # logger.record_tabular("steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show()
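# Minimal standalone sketch (plain NumPy, made-up rewards): the running-average plot above uses
# np.convolve in its default 'full' mode, which also produces ramp-up and ramp-down edges where
# fewer than num2avg episodes are summed but the result is still divided by num2avg. 'valid'
# mode keeps only fully covered windows, so every point is a true num2avg-episode mean
# (plot with plt.plot(running_avg) as above).
import numpy as np

episode_rewards = np.random.rand(200)
num2avg = 20
running_avg = np.convolve(episode_rewards, np.ones(num2avg), mode='valid') / num2avg
print(running_avg.shape)    # (len(episode_rewards) - num2avg + 1,)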
def train( env_name, file_name, network_type, env_seed = None, seed = None, buffer_size = int(1e5), alpha = 0.6, batch_size = 32, reward_min = -1.0, reward_max = 1.0, reward_discount = 0.99, epsilon_start = 1.0, epsilon_end = 0.05, epsilon_decay_step = int(1e6), beta_start = 0.4, beta_end = 1.0, beta_decay_step = int(1e7), lrs = [5e-5, 5e-6], lr_cutoff_steps = [int(8e6)], total_steps = int(1e7), initial_buffer_size = int(1e5), target_network_update_step = 1000, training_step = 4, last_k_episodes = 100, print_frequency = 10, save_frequency = 0.01 ): # Create folders. if not os.path.isdir(SAVE_DIR): os.makedirs(SAVE_DIR) if not os.path.isdir(CSV_DIR): os.makedirs(CSV_DIR) # Create environment. env = make_atari(env_name) if env_seed is not None: env.seed(env_seed) obs_shape = env.observation_space.shape num_action = env.action_space.n if seed is not None: np.random.seed(seed) tf.set_random_seed(seed) # Initialize step schedules. epsilon = LinearSchedule(start = epsilon_start, end = epsilon_end, decay_step = epsilon_decay_step) beta = LinearSchedule(start = beta_start, end = beta_end, decay_step = beta_decay_step) learning_rate = StaircaseSchedule(values = lrs, cutoff_steps = lr_cutoff_steps) # Initialize replay buffer. replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha = alpha) # Build model graph. model_graph = ModelGraph(obs_shape, num_action, network_type = network_type, gamma = reward_discount) # Initialize session and variables. sess = tf.InteractiveSession() model_graph.initialize_variables() model_graph.update_target_network() start_time = time.time() list_step = [] list_episodic_reward = [] list_mean_episodic_reward = [] episodic_reward = 0 highest_episodic_reward = None obs = env.reset() for step in range(1, total_steps): # Synchronize the target network periodically (target network <- main network). if step > initial_buffer_size and step % target_network_update_step == 0: model_graph.update_target_network() # Sample action with epsilon-greedy policy. action = model_graph.epsilon_act(np.expand_dims(obs, axis = 0), epsilon.get_value(step))[0] # Interact with the environment. obs_next, reward, done, _ = env.step(action) episodic_reward += reward if done: obs_next = env.reset() # Record episodic reward. list_step.append(step) list_episodic_reward.append(episodic_reward) mean_episodic_reward = np.round(np.mean(list_episodic_reward[-last_k_episodes:]), 2) list_mean_episodic_reward.append(mean_episodic_reward) if len(list_episodic_reward) % print_frequency == 0: print("Episode ", str(len(list_episodic_reward)), ": step = ", step, ", mean reward = ", mean_episodic_reward, ".", sep = "") # Save the network when the mean episodic reward breaks the record. if step >= initial_buffer_size and len(list_episodic_reward) >= last_k_episodes: if highest_episodic_reward is None or mean_episodic_reward > highest_episodic_reward: if np.random.uniform() < save_frequency: model_graph.save(SAVE_DIR + file_name) print("Save the network as mean episodic reward increases from ", highest_episodic_reward, " to ", mean_episodic_reward, ".", sep = "") highest_episodic_reward = mean_episodic_reward episodic_reward = 0 # Store data. data = (obs, action, reward, done, obs_next) replay_buffer.append(data) # Update observation. obs = obs_next # Train the agent. if step > initial_buffer_size and step % training_step == 0: # Sample training data from the replay buffer. 
batch_index, batch_data, batch_weights = replay_buffer.sample(batch_size, beta.get_value(step)) batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \ [np.array([batch_data[j][i] for j in range(batch_size)]) for i in range(len(batch_data[0]))] # Clip the reward. batch_reward = np.clip(batch_reward, reward_min, reward_max) # One train step. td_error = model_graph.train(batch_obs, batch_action, batch_reward, batch_done, batch_obs_next, batch_weights, learning_rate.get_value(step)) # Update priority for the sampled data. replay_buffer.update_priorities(batch_index, td_error) sess.close() tf.contrib.keras.backend.clear_session() total_time = int(time.time() - start_time) print("Training finished in ", total_time, " s.", sep = "") # Close the environment. env.close() # Store data in a csv file. record = pd.DataFrame({"Step": list_step, "Mean Episodic Reward": list_mean_episodic_reward}) record.to_csv(CSV_DIR + file_name + ".csv", sep = ",", index = False)
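# Minimal standalone sketch (plain NumPy, fake transitions): the nested list comprehension
# above transposes the sampled list of (obs, action, reward, done, obs_next) tuples into one
# array per field; zip(*batch_data) does the same thing, and the reward clipping mirrors the
# np.clip call in the training step.
import numpy as np

batch_data = [
    (np.zeros((4, 4)), 1,  1.7, False, np.ones((4, 4))),
    (np.ones((4, 4)),  0, -0.3, True,  np.zeros((4, 4))),
]

batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
    [np.array(column) for column in zip(*batch_data)]

batch_reward = np.clip(batch_reward, -1.0, 1.0)
print(batch_obs.shape, batch_action, batch_reward, batch_done)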