def __init__(self, env, gamma, target_update_freq, num_burn_in, train_freq, batch_size, mode,
             log_parent_dir='/data/datasets/ratneshm/deeprl_hw2/q5'):
    self.env_string = env
    self.env = gym.make(env)
    self.num_actions = self.env.action_space.n
    self.gamma = gamma
    self.target_update_freq = target_update_freq
    self.num_burn_in = num_burn_in
    self.train_freq = train_freq
    self.batch_size = batch_size
    self.iter_ctr = 0
    self.eval_episode_ctr = 0
    self.preprocessor = preprocessors.PreprocessorSequence()

    # loggers
    self.qavg_list = np.array([0])
    self.reward_list = []
    self.loss_log = []
    self.loss_last = None

    self.mode = mode
    self.log_parent_dir = log_parent_dir
    self.make_log_dir()  # makes empty dir and logfiles based on current timestamp inside self.log_parent_dir
def __init__(self, env, gamma, target_update_freq, num_burn_in, train_freq, batch_size, mode,
             eval_dir=None, resume_dir=None,
             log_parent_dir='/home/vaibhav/madratman/logs/project/dqn'):
    self.env_string = env
    self.env = gym.make(env)
    self.num_actions = self.env.action_space.n
    self.gamma = gamma
    self.target_update_freq = target_update_freq
    self.num_burn_in = num_burn_in
    self.train_freq = train_freq
    self.batch_size = batch_size
    self.train_iter_ctr = 0
    self.eval_episode_ctr = 0
    self.preprocessor = preprocessors.PreprocessorSequence()

    self.eval_dir = eval_dir
    self.resume_dir = resume_dir
    self.is_eval = False  # eval only mode
    self.is_resume = False
    if eval_dir is not None:
        self.is_eval = True
    if resume_dir is not None:
        self.is_resume = True

    # loggers
    self.qavg_list = np.array([0])
    self.reward_list = []
    self.loss_log = []
    self.loss_last = None

    self.mode = mode
    self.log_parent_dir = log_parent_dir
    self.make_log_dir()  # makes empty dir and logfiles based on current timestamp inside self.log_parent_dir
    self.start_time = timeit.default_timer()

    # printing
    self.RED = '\033[91m'
    self.BOLD = '\033[1m'
    self.ENDC = '\033[0m'
    self.LINE = "%s%s##############################################################################%s" % (self.RED, self.BOLD, self.ENDC)
def __init__(self, env, gamma, target_update_freq, num_burn_in, train_freq, batch_size, mode,
             resume_dir, log_parent_dir='/home/vaibhav/madratman/logs/project/dqn'):
    self.env_string = env
    self.env = gym.make(env)
    self.num_actions = self.env.action_space.n
    self.gamma = gamma
    self.target_update_freq = target_update_freq
    self.num_burn_in = num_burn_in
    self.train_freq = train_freq
    self.batch_size = batch_size

    if resume_dir is None:
        self.train_iter_ctr = 0
        self.eval_episode_ctr = 0
        self.preprocessor = preprocessors.PreprocessorSequence()

        # loggers
        self.qavg_list = np.array([0])
        self.reward_list = []
        self.loss_log = []
        self.loss_last = None

        self.mode = mode
        self.log_parent_dir = log_parent_dir
        # self.make_log_dir()  # makes empty dir and logfiles based on current timestamp inside self.log_parent_dir

    if resume_dir is not None:
        print "resuming from ", resume_dir
        self.resume_from_log_dir(resume_dir)
        self.is_resume = True
        print "self.is_resume", self.is_resume
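# A minimal construction sketch for the class these methods belong to. Assumptions: the class name
# DQNAgent, the 'Breakout-v0' env id, and every hyperparameter value below are illustrative placeholders,
# not taken from this file; the keyword arguments follow the second __init__ variant above (the one that
# accepts eval_dir and resume_dir). It shows the three start-up modes the constructors distinguish:
# fresh training, resuming from an existing log dir, and evaluation-only from saved weights.
if __name__ == '__main__':
    # fresh training run: counters start at zero and a new timestamped log dir is created via make_log_dir()
    agent = DQNAgent(env='Breakout-v0', gamma=0.99, target_update_freq=10000,
                     num_burn_in=50000, train_freq=4, batch_size=32, mode='train')

    # resume an interrupted run: passing resume_dir flags the run as a resume (is_resume);
    # the third __init__ variant also restores state via resume_from_log_dir()
    resumed_agent = DQNAgent(env='Breakout-v0', gamma=0.99, target_update_freq=10000,
                             num_burn_in=50000, train_freq=4, batch_size=32, mode='train',
                             resume_dir='/path/to/previous/log_dir')

    # evaluation-only run: passing eval_dir flags is_eval, so evaluate() compiles the network
    # and loads saved weights before rolling out the greedy policy
    eval_agent = DQNAgent(env='Breakout-v0', gamma=0.99, target_update_freq=10000,
                          num_burn_in=50000, train_freq=4, batch_size=32, mode='test',
                          eval_dir='/path/to/saved/weights')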
def evaluate(self, num_episodes, max_episode_length=None, gen_video=False):
    self.compile(self.is_resume)
    evaluation_policy = GreedyPolicy()
    eval_preprocessor = preprocessors.PreprocessorSequence()
    # env_valid = gym.make(self.env_string)
    iter_ctr_valid = 0
    Q_sum = 0
    eval_episode_ctr_valid = 0
    total_reward_all_episodes = []

    # https://github.com/openai/gym/blob/master/gym/wrappers/monitoring.py video_callable takes a function as arg, so we hack with a lambda that always returns True
    # https://github.com/openai/gym/issues/494
    if gen_video:
        video_dir = os.path.join(self.log_dir, 'gym_monitor', str(self.train_iter_ctr).zfill(7))
        os.makedirs(video_dir)
        self.env = wrappers.Monitor(self.env, video_dir, video_callable=lambda x: True, mode='evaluation')

    RED = '\033[91m'
    BOLD = '\033[1m'
    ENDC = '\033[0m'
    LINE = "%s%s##############################################################################%s" % (RED, BOLD, ENDC)

    while eval_episode_ctr_valid < num_episodes:
        state = self.env.reset()
        eval_preprocessor.reset_history_memory()
        num_timesteps_in_curr_episode = 0
        total_reward_curr_episode = 0.0

        while num_timesteps_in_curr_episode < max_episode_length:
            num_timesteps_in_curr_episode += 1
            iter_ctr_valid += 1

            state_network = self.preprocessor.process_state_for_network(state)
            q_values = self.calc_q_values(state_network)
            Q_sum += np.max(q_values)  # todo fix this
            action = evaluation_policy.select_action(q_values)
            next_state, reward, is_terminal, _ = self.env.step(action)
            total_reward_curr_episode += reward
            # print "Evaluation : timestep {}, episode {}, action {}, reward {}, total_reward {}"\
            #     .format(iter_ctr_valid, eval_episode_ctr_valid, action, reward, total_reward_curr_episode)

            if is_terminal or (num_timesteps_in_curr_episode > max_episode_length - 1):
                eval_episode_ctr_valid += 1
                str_1 = "Evaluate() : iter_ctr_valid {}, eval_episode_ctr_valid : {}, total_reward_curr_episode : {}, num_timesteps_in_curr_episode {}"\
                    .format(iter_ctr_valid, eval_episode_ctr_valid, total_reward_curr_episode, num_timesteps_in_curr_episode)
                msg = "\n%s\n" % (LINE)
                msg += "%s%s\n" % (BOLD, str_1)
                msg += "%s\n" % (LINE)
                print(str(msg))
                total_reward_all_episodes.append(total_reward_curr_episode)
                # num_timesteps_in_curr_episode = 0
                break

            state = next_state

    Q_avg = Q_sum / float(iter_ctr_valid)
    print " sum(total_reward_all_episodes) : {} , float(len(total_reward_all_episodes)) : {}".format\
        (sum(total_reward_all_episodes), float(len(total_reward_all_episodes)))
    all_episode_avg_reward = sum(total_reward_all_episodes) / float(len(total_reward_all_episodes))

    with tf.name_scope('summaries'):
        self.tf_log_scaler(tag='test_mean_avg_reward', value=all_episode_avg_reward, step=self.train_iter_ctr)
        self.tf_log_scaler(tag='test_mean_Q_max', value=Q_avg, step=self.train_iter_ctr)

    self.dump_test_episode_reward(all_episode_avg_reward)
    self.qavg_list = np.append(self.qavg_list, Q_avg)
    self.reward_list.append(all_episode_avg_reward)
    print "all_episode_avg_reward ", all_episode_avg_reward
    print "\n\n\n self.reward_list \n\n\n", self.reward_list
def evaluate(self, num_episodes, max_episode_length=None, gen_video=False):
    """Test your agent with a provided environment.

    You shouldn't update your network parameters here. Also if you
    have any layers that vary in behavior between train/test time
    (such as dropout or batch norm), you should set them to test.

    Basically run your policy on the environment and collect stats
    like cumulative reward, average episode length, etc.

    You can also call the render function here if you want to
    visually inspect your policy.
    """
    evaluation_policy = GreedyPolicy()
    eval_preprocessor = preprocessors.PreprocessorSequence()
    env_valid = gym.make(self.env_string)
    iter_ctr_valid = 0
    Q_sum = 0
    eval_episode_ctr_valid = 0
    total_reward_all_episodes = []

    # https://github.com/openai/gym/blob/master/gym/wrappers/monitoring.py video_callable takes a function as arg, so we hack with a lambda that always returns True
    # https://github.com/openai/gym/issues/494
    # if gen_video:
    #     video_dir = os.path.join(self.log_dir, 'gym_monitor', str(self.iter_ctr).zfill(7))
    #     os.makedirs(video_dir)
    #     env_valid = wrappers.Monitor(env_valid, video_dir, video_callable=lambda x: True, mode='evaluation')

    while eval_episode_ctr_valid < num_episodes:
        state = env_valid.reset()
        eval_preprocessor.reset_history_memory()
        num_timesteps_in_curr_episode = 0
        total_reward_curr_episode = 0.0

        while num_timesteps_in_curr_episode < max_episode_length:
            num_timesteps_in_curr_episode += 1
            iter_ctr_valid += 1

            state_network = self.preprocessor.process_state_for_network(state)
            q_values = self.calc_q_values(state_network)
            Q_sum += np.max(q_values)  # todo fix this
            action = evaluation_policy.select_action(q_values)
            next_state, reward, is_terminal, _ = env_valid.step(action)
            total_reward_curr_episode += reward
            # print "Evaluation : timestep {}, episode {}, action {}, reward {}, total_reward {}"\
            #     .format(iter_ctr_valid, eval_episode_ctr_valid, action, reward, total_reward_curr_episode)

            if is_terminal or (num_timesteps_in_curr_episode > max_episode_length - 1):
                eval_episode_ctr_valid += 1
                print "Evaluate() : iter_ctr_valid {}, eval_episode_ctr_valid : {}, total_reward_curr_episode : {}, num_timesteps_in_curr_episode {}"\
                    .format(iter_ctr_valid, eval_episode_ctr_valid, total_reward_curr_episode, num_timesteps_in_curr_episode)
                total_reward_all_episodes.append(total_reward_curr_episode)
                # num_timesteps_in_curr_episode = 0
                break

            state = next_state

    Q_avg = Q_sum / float(iter_ctr_valid)
    print " sum(total_reward_all_episodes) : {} , float(len(total_reward_all_episodes)) : {}".format\
        (sum(total_reward_all_episodes), float(len(total_reward_all_episodes)))
    all_episode_avg_reward = sum(total_reward_all_episodes) / float(len(total_reward_all_episodes))

    all_episode_avg_reward_np = np.array(total_reward_all_episodes)
    mean_calc = np.mean(all_episode_avg_reward_np)
    std_calc = np.std(all_episode_avg_reward_np)
    # print all_episode_avg_reward_np.dtype
    # print all_episode_avg_reward_np.astype(int).std()
    # print all_episode_avg_reward_np.mean()
    print "-------"
    print "MEAN===", mean_calc
    print "STD===", std_calc
    print "-------"

    # with tf.name_scope('summaries'):
    #     self.tf_log_scaler(tag='test_mean_avg_reward', value=all_episode_avg_reward, step=self.iter_ctr)
    #     self.tf_log_scaler(tag='test_mean_Q_max', value=Q_avg, step=self.iter_ctr)
    # self.dump_test_episode_reward(all_episode_avg_reward)
    # self.qavg_list = np.append(self.qavg_list, Q_avg)
    # self.reward_list.append(all_episode_avg_reward)
    # pkl.dump(self.reward_list, open("/data/datasets/ratneshm/deeprl_hw2/eval_rewards.pkl", "wb"))
    print "all_episode_avg_reward ", all_episode_avg_reward
    print "\n\n\n self.reward_list \n\n\n", self.reward_list
def evaluate(self, num_episodes, max_episode_length=None, gen_video=False):
    """Test your agent with a provided environment.

    You shouldn't update your network parameters here. Also if you
    have any layers that vary in behavior between train/test time
    (such as dropout or batch norm), you should set them to test.

    Basically run your policy on the environment and collect stats
    like cumulative reward, average episode length, etc.

    You can also call the render function here if you want to
    visually inspect your policy.
    """
    if self.is_eval:
        self.compile()  # load saved weights

    evaluation_policy = GreedyPolicy()
    eval_preprocessor = preprocessors.PreprocessorSequence()
    # env_valid = gym.make(self.env_string)
    iter_ctr_valid = 0
    Q_sum = 0
    eval_episode_ctr_valid = 0
    total_reward_all_episodes = []

    # https://github.com/openai/gym/blob/master/gym/wrappers/monitoring.py video_callable takes a function as arg, so we hack with a lambda that always returns True
    # https://github.com/openai/gym/issues/494
    if gen_video:
        video_dir = os.path.join(self.log_dir, 'gym_monitor', str(self.train_iter_ctr).zfill(7))
        os.makedirs(video_dir)
        self.env = wrappers.Monitor(self.env, video_dir, video_callable=lambda x: True, mode='evaluation')

    RED = '\033[91m'
    BOLD = '\033[1m'
    ENDC = '\033[0m'
    LINE = "%s%s##############################################################################%s" % (RED, BOLD, ENDC)

    while eval_episode_ctr_valid < num_episodes:
        state = self.env.reset()
        eval_preprocessor.reset_history_memory()
        num_timesteps_in_curr_episode = 0
        total_reward_curr_episode = 0.0

        while num_timesteps_in_curr_episode < max_episode_length:
            num_timesteps_in_curr_episode += 1
            iter_ctr_valid += 1

            state_network = self.preprocessor.process_state_for_network(state)
            q_values = self.calc_q_values(state_network)
            Q_sum += np.max(q_values)  # todo fix this
            action = evaluation_policy.select_action(q_values)
            next_state, reward, is_terminal, _ = self.env.step(action)
            total_reward_curr_episode += reward
            # print "Evaluation : timestep {}, episode {}, action {}, reward {}, total_reward {}"\
            #     .format(iter_ctr_valid, eval_episode_ctr_valid, action, reward, total_reward_curr_episode)

            if is_terminal or (num_timesteps_in_curr_episode > max_episode_length - 1):
                eval_episode_ctr_valid += 1
                str_1 = "Evaluate() : iter_ctr_valid {}, eval_episode_ctr_valid : {}, total_reward_curr_episode : {:.2f}, num_timesteps_in_curr_episode {}"\
                    .format(iter_ctr_valid, eval_episode_ctr_valid, total_reward_curr_episode, num_timesteps_in_curr_episode)
                msg = "\n%s\n" % (self.LINE) + "%s%s\n" % (self.BOLD, str_1) + "%s\n" % (self.LINE)
                print(str(msg))
                total_reward_all_episodes.append(total_reward_curr_episode)
                # num_timesteps_in_curr_episode = 0
                break

            state = next_state

    Q_avg = Q_sum / float(iter_ctr_valid)
    print " sum(total_reward_all_episodes) : {} , float(len(total_reward_all_episodes)) : {}".format\
        (sum(total_reward_all_episodes), float(len(total_reward_all_episodes)))
    all_episode_avg_reward = sum(total_reward_all_episodes) / float(len(total_reward_all_episodes))

    with tf.name_scope('summaries'):
        self.tf_log_scaler(tag='test_mean_avg_reward', value=all_episode_avg_reward, step=self.train_iter_ctr)
        self.tf_log_scaler(tag='test_mean_Q_max', value=Q_avg, step=self.train_iter_ctr)

    self.dump_test_episode_reward(all_episode_avg_reward)
    self.qavg_list = np.append(self.qavg_list, Q_avg)
    self.reward_list.append(all_episode_avg_reward)
    print "all_episode_avg_reward ", all_episode_avg_reward
    print "\n\n\n self.reward_list \n\n\n", self.reward_list
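# A hedged usage sketch for evaluate(), reusing the hypothetical DQNAgent name and placeholder
# hyperparameters from the construction sketch above; the episode count and length are illustrative.
# evaluate() rolls out the greedy policy for num_episodes episodes, capping each at max_episode_length
# steps, and logs the mean episode reward and mean max-Q. Note that max_episode_length should be passed
# explicitly: with the None default the inner timestep loop never runs under Python 2 comparison rules.
# gen_video=True wraps the environment in a gym Monitor so every evaluation episode is recorded.
eval_agent = DQNAgent(env='Breakout-v0', gamma=0.99, target_update_freq=10000,
                      num_burn_in=50000, train_freq=4, batch_size=32, mode='test',
                      eval_dir='/path/to/saved/weights')
eval_agent.evaluate(num_episodes=20, max_episode_length=10000, gen_video=False)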