Example #1
    def __init__(self,
                 env,
                 gamma,
                 target_update_freq,
                 num_burn_in,
                 train_freq,
                 batch_size,
                 mode,
                 log_parent_dir='/data/datasets/ratneshm/deeprl_hw2/q5'):

        self.env_string = env
        self.env = gym.make(env)
        self.num_actions = self.env.action_space.n
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.num_burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.iter_ctr = 0

        self.eval_episode_ctr = 0
        self.preprocessor = preprocessors.PreprocessorSequence()

        # loggers
        self.qavg_list = np.array([0])
        self.reward_list = []
        self.loss_log = []
        self.loss_last = None
        self.mode = mode
        self.log_parent_dir = log_parent_dir
        self.make_log_dir() # makes empty dir and logfiles based on current timestamp inside self.log_parent_dir
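
A minimal construction sketch for the signature above. The class name DQNAgent, the mode string, and every hyperparameter value are illustrative assumptions; env is any id accepted by gym.make().

    agent = DQNAgent(env='SpaceInvaders-v0',
                     gamma=0.99,
                     target_update_freq=10000,
                     num_burn_in=50000,
                     train_freq=4,
                     batch_size=32,
                     mode='dqn')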
Example #2
    def __init__(self,
                 env,
                 gamma,
                 target_update_freq,
                 num_burn_in,
                 train_freq,
                 batch_size,
                 mode,
                 eval_dir=None,
                 resume_dir=None,
                 log_parent_dir='/home/vaibhav/madratman/logs/project/dqn'):

        self.env_string = env
        self.env = gym.make(env)
        self.num_actions = self.env.action_space.n
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.num_burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.train_iter_ctr = 0

        self.eval_episode_ctr = 0
        self.preprocessor = preprocessors.PreprocessorSequence()
        self.eval_dir = eval_dir
        self.resume_dir = resume_dir
        self.is_eval = False  # eval only mode
        self.is_resume = False

        if eval_dir is not None:
            self.is_eval = True
        if resume_dir is not None:
            self.is_resume = True

        # loggers
        self.qavg_list = np.array([0])
        self.reward_list = []
        self.loss_log = []
        self.loss_last = None
        self.mode = mode
        self.log_parent_dir = log_parent_dir
        self.make_log_dir()  # makes empty dir and logfiles based on current timestamp inside self.log_parent_dir
        self.start_time = timeit.default_timer()

        # printing
        self.RED = '\033[91m'
        self.BOLD = '\033[1m'
        self.ENDC = '\033[0m'
        self.LINE = "%s%s##############################################################################%s" % (
            self.RED, self.BOLD, self.ENDC)
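
Example #2 switches between plain training, eval-only, and resume modes purely from which directory keyword is supplied. A sketch of the three call patterns, reusing the hypothetical DQNAgent name from the sketch above; every path and value is a placeholder.

    common = dict(env='SpaceInvaders-v0', gamma=0.99, target_update_freq=10000,
                  num_burn_in=50000, train_freq=4, batch_size=32, mode='dqn')

    train_agent = DQNAgent(**common)                                          # is_eval and is_resume stay False
    eval_agent = DQNAgent(eval_dir='/path/to/finished/run', **common)         # is_eval becomes True
    resume_agent = DQNAgent(resume_dir='/path/to/interrupted/run', **common)  # is_resume becomes True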
Example #3
    def __init__(self,
                 env,
                 gamma,
                 target_update_freq,
                 num_burn_in,
                 train_freq,
                 batch_size,
                 mode,
                 resume_dir,
                 log_parent_dir='/home/vaibhav/madratman/logs/project/dqn'):

        self.env_string = env
        self.env = gym.make(env)
        self.num_actions = self.env.action_space.n
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.num_burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        if resume_dir is None:
            self.train_iter_ctr = 0

        self.eval_episode_ctr = 0
        self.preprocessor = preprocessors.PreprocessorSequence()

        # loggers
        self.qavg_list = np.array([0])
        self.reward_list = []
        self.loss_log = []
        self.loss_last = None
        self.mode = mode
        self.log_parent_dir = log_parent_dir
        # self.make_log_dir() # makes empty dir and logfiles based on current timestamp inside self.log_parent_dir
        self.is_resume = False
        if resume_dir is not None:
            print "resuming from ", resume_dir
            self.resume_from_log_dir(resume_dir)
            self.is_resume = True
        print "self.is_resume", self.is_resume
Example #4
    def evaluate(self, num_episodes, max_episode_length=None, gen_video=False):
        self.compile(self.is_resume)
        evaluation_policy = GreedyPolicy()
        eval_preprocessor = preprocessors.PreprocessorSequence()
        # env_valid = gym.make(self.env_string)

        iter_ctr_valid = 0
        Q_sum = 0
        eval_episode_ctr_valid = 0
        total_reward_all_episodes = []

        # https://github.com/openai/gym/blob/master/gym/wrappers/monitoring.py
        # video_callable takes a function, so we pass a lambda that always returns True
        # https://github.com/openai/gym/issues/494
        if gen_video:
            video_dir = os.path.join(self.log_dir, 'gym_monitor',
                                     str(self.train_iter_ctr).zfill(7))
            os.makedirs(video_dir)
            # wrap the env that is stepped below so the evaluation episodes get recorded
            self.env = wrappers.Monitor(self.env,
                                        video_dir,
                                        video_callable=lambda x: True,
                                        mode='evaluation')
        RED = '\033[91m'
        BOLD = '\033[1m'
        ENDC = '\033[0m'
        LINE = "%s%s##############################################################################%s" % (
            RED, BOLD, ENDC)

        while eval_episode_ctr_valid < num_episodes:
            state = self.env.reset()
            eval_preprocessor.reset_history_memory()
            num_timesteps_in_curr_episode = 0
            total_reward_curr_episode = 0.0

            while num_timesteps_in_curr_episode < max_episode_length:
                num_timesteps_in_curr_episode += 1
                iter_ctr_valid += 1

                state_network = eval_preprocessor.process_state_for_network(
                    state)
                q_values = self.calc_q_values(state_network)
                Q_sum += np.max(q_values)  # todo fix this

                action = evaluation_policy.select_action(q_values)
                next_state, reward, is_terminal, _ = self.env.step(action)
                total_reward_curr_episode += reward
                # print "Evalution : timestep {}, episode {}, action {}, reward {}, total_reward {}"\
                # .format(iter_ctr_valid, eval_episode_ctr_valid, action, reward, total_reward_curr_episode)

                if is_terminal or (num_timesteps_in_curr_episode >
                                   max_episode_length - 1):
                    eval_episode_ctr_valid += 1
                    str_1 = "Evaluate() : iter_ctr_valid {}, eval_episode_ctr_valid : {}, total_reward_curr_episode : {}, num_timesteps_in_curr_episode {}"\
                            .format(iter_ctr_valid, eval_episode_ctr_valid, total_reward_curr_episode, num_timesteps_in_curr_episode)
                    msg = "\n%s\n" % (LINE)
                    msg += "%s%s\n" % (BOLD, str_1)
                    msg += "%s\n" % (LINE)
                    print(str(msg))

                    total_reward_all_episodes.append(total_reward_curr_episode)
                    # num_timesteps_in_curr_episode = 0
                    break

                state = next_state

        Q_avg = Q_sum / float(iter_ctr_valid)
        print " sum(total_reward_all_episodes) : {} , float(len(total_reward_all_episodes)) : {}".format\
                (sum(total_reward_all_episodes), float(len(total_reward_all_episodes)))
        all_episode_avg_reward = sum(total_reward_all_episodes) / float(
            len(total_reward_all_episodes))
        with tf.name_scope('summaries'):
            self.tf_log_scaler(tag='test_mean_avg_reward',
                               value=all_episode_avg_reward,
                               step=self.train_iter_ctr)
            self.tf_log_scaler(tag='test_mean_Q_max',
                               value=Q_avg,
                               step=self.train_iter_ctr)
        self.dump_test_episode_reward(all_episode_avg_reward)
        self.qavg_list = np.append(self.qavg_list, Q_avg)
        self.reward_list.append(all_episode_avg_reward)

        print "all_episode_avg_reward ", all_episode_avg_reward
        print "\n\n\n self.reward_list \n\n\n", self.reward_list
Example #5
    def evaluate(self, num_episodes, max_episode_length=None, gen_video=False):
        """Test your agent with a provided environment.
        
        You shouldn't update your network parameters here. Also if you
        have any layers that vary in behavior between train/test time
        (such as dropout or batch norm), you should set them to test.

        Basically run your policy on the environment and collect stats
        like cumulative reward, average episode length, etc.

        You can also call the render function here if you want to
        visually inspect your policy.
        """
        evaluation_policy = GreedyPolicy()
        eval_preprocessor = preprocessors.PreprocessorSequence()
        env_valid = gym.make(self.env_string)

        iter_ctr_valid = 0
        Q_sum = 0
        eval_episode_ctr_valid = 0
        total_reward_all_episodes = []
  
        # https://github.com/openai/gym/blob/master/gym/wrappers/monitoring.py
        # video_callable takes a function, so we pass a lambda that always returns True
        # https://github.com/openai/gym/issues/494
        # if gen_video:
        #     video_dir = os.path.join(self.log_dir, 'gym_monitor', str(self.iter_ctr).zfill(7))
        #     os.makedirs(video_dir)
        #     env_valid = wrappers.Monitor(env_valid, video_dir, video_callable=lambda x:True, mode='evaluation')

        while eval_episode_ctr_valid < num_episodes:
            state = env_valid.reset()
            eval_preprocessor.reset_history_memory()
            num_timesteps_in_curr_episode = 0
            total_reward_curr_episode = 0.0

            while num_timesteps_in_curr_episode < max_episode_length:
                num_timesteps_in_curr_episode += 1
                iter_ctr_valid += 1

                state_network = eval_preprocessor.process_state_for_network(state)
                q_values = self.calc_q_values(state_network)
                Q_sum += np.max(q_values) # todo fix this

                action = evaluation_policy.select_action(q_values)
                next_state, reward, is_terminal, _ = env_valid.step(action)
                total_reward_curr_episode += reward
                # print "Evalution : timestep {}, episode {}, action {}, reward {}, total_reward {}"\
                        # .format(iter_ctr_valid, eval_episode_ctr_valid, action, reward, total_reward_curr_episode)

                if is_terminal or (num_timesteps_in_curr_episode > max_episode_length-1):
                    eval_episode_ctr_valid += 1
                    print "Evaluate() : iter_ctr_valid {}, eval_episode_ctr_valid : {}, total_reward_curr_episode : {}, num_timesteps_in_curr_episode {}"\
                            .format(iter_ctr_valid, eval_episode_ctr_valid, total_reward_curr_episode, num_timesteps_in_curr_episode)
                    total_reward_all_episodes.append(total_reward_curr_episode)
                    # num_timesteps_in_curr_episode = 0
                    break

                state = next_state

        Q_avg = Q_sum/float(iter_ctr_valid)
        print " sum(total_reward_all_episodes) : {} , float(len(total_reward_all_episodes)) : {}".format\
                (sum(total_reward_all_episodes), float(len(total_reward_all_episodes)))

        all_episode_avg_reward = sum(total_reward_all_episodes)/float(len(total_reward_all_episodes))

        all_episode_avg_reward_np = np.array(total_reward_all_episodes)
        mean_calc = np.mean(all_episode_avg_reward_np)
        std_calc = np.std(all_episode_avg_reward_np)
        # print all_episode_avg_reward_np.dtype
        # print all_episode_avg_reward_np.astype(int).std()
        # print all_episode_avg_reward_np.mean()

        print "-------"
        print "MEAN===", mean_calc
        print "STD===", std_calc
        print "-------"

        # with tf.name_scope('summaries'):
        #     self.tf_log_scaler(tag='test_mean_avg_reward', value=all_episode_avg_reward, step=self.iter_ctr)
        #     self.tf_log_scaler(tag='test_mean_Q_max', value=Q_avg, step=self.iter_ctr)
        # self.dump_test_episode_reward(all_episode_avg_reward)
        # self.qavg_list = np.append(self.qavg_list, Q_avg)
        # self.reward_list.append(all_episode_avg_reward)

        # pkl.dump(self.reward_list, open("/data/datasets/ratneshm/deeprl_hw2/eval_rewards.pkl", "wb"))
        
        print "all_episode_avg_reward ", all_episode_avg_reward
        print "\n\n\n self.reward_list \n\n\n", self.reward_list
Example #6
    def evaluate(self, num_episodes, max_episode_length=None, gen_video=False):
        """Test your agent with a provided environment.
        
        You shouldn't update your network parameters here. Also if you
        have any layers that vary in behavior between train/test time
        (such as dropout or batch norm), you should set them to test.

        Basically run your policy on the environment and collect stats
        like cumulative reward, average episode length, etc.

        You can also call the render function here if you want to
        visually inspect your policy.
        """
        if self.is_eval:
            self.compile()  # load saved weights

        evaluation_policy = GreedyPolicy()
        eval_preprocessor = preprocessors.PreprocessorSequence()
        # env_valid = gym.make(self.env_string)

        iter_ctr_valid = 0
        Q_sum = 0
        eval_episode_ctr_valid = 0
        total_reward_all_episodes = []

        # https://github.com/openai/gym/blob/master/gym/wrappers/monitoring.py
        # video_callable takes a function, so we pass a lambda that always returns True
        # https://github.com/openai/gym/issues/494
        if gen_video:
            video_dir = os.path.join(self.log_dir, 'gym_monitor',
                                     str(self.train_iter_ctr).zfill(7))
            os.makedirs(video_dir)
            # wrap the env that is stepped below so the evaluation episodes get recorded
            self.env = wrappers.Monitor(self.env,
                                        video_dir,
                                        video_callable=lambda x: True,
                                        mode='evaluation')
        RED = '\033[91m'
        BOLD = '\033[1m'
        ENDC = '\033[0m'
        LINE = "%s%s##############################################################################%s" % (
            RED, BOLD, ENDC)

        while eval_episode_ctr_valid < num_episodes:
            state = self.env.reset()
            eval_preprocessor.reset_history_memory()
            num_timesteps_in_curr_episode = 0
            total_reward_curr_episode = 0.0

            while num_timesteps_in_curr_episode < max_episode_length:
                num_timesteps_in_curr_episode += 1
                iter_ctr_valid += 1

                state_network = eval_preprocessor.process_state_for_network(
                    state)
                q_values = self.calc_q_values(state_network)
                Q_sum += np.max(q_values)  # todo fix this

                action = evaluation_policy.select_action(q_values)
                next_state, reward, is_terminal, _ = self.env.step(action)
                total_reward_curr_episode += reward
                # print "Evalution : timestep {}, episode {}, action {}, reward {}, total_reward {}"\
                # .format(iter_ctr_valid, eval_episode_ctr_valid, action, reward, total_reward_curr_episode)

                if is_terminal or (num_timesteps_in_curr_episode >
                                   max_episode_length - 1):
                    eval_episode_ctr_valid += 1
                    str_1 = "Evaluate() : iter_ctr_valid {}, eval_episode_ctr_valid : {}, total_reward_curr_episode : {:.2f}, num_timesteps_in_curr_episode {}"\
                            .format(iter_ctr_valid, eval_episode_ctr_valid, total_reward_curr_episode, num_timesteps_in_curr_episode)
                    msg = "\n%s\n" % (self.LINE) + "%s%s\n" % (
                        self.BOLD, str_1) + "%s\n" % (self.LINE)
                    print(str(msg))

                    total_reward_all_episodes.append(total_reward_curr_episode)
                    # num_timesteps_in_curr_episode = 0
                    break

                state = next_state

        Q_avg = Q_sum / float(iter_ctr_valid)
        print " sum(total_reward_all_episodes) : {} , float(len(total_reward_all_episodes)) : {}".format\
                (sum(total_reward_all_episodes), float(len(total_reward_all_episodes)))
        all_episode_avg_reward = sum(total_reward_all_episodes) / float(
            len(total_reward_all_episodes))
        with tf.name_scope('summaries'):
            self.tf_log_scaler(tag='test_mean_avg_reward',
                               value=all_episode_avg_reward,
                               step=self.train_iter_ctr)
            self.tf_log_scaler(tag='test_mean_Q_max',
                               value=Q_avg,
                               step=self.train_iter_ctr)
        self.dump_test_episode_reward(all_episode_avg_reward)
        self.qavg_list = np.append(self.qavg_list, Q_avg)
        self.reward_list.append(all_episode_avg_reward)

        print "all_episode_avg_reward ", all_episode_avg_reward
        print "\n\n\n self.reward_list \n\n\n", self.reward_list