Example #1
# Assumes TensorFlow 1.x (graph mode) and NumPy, plus the project-local
# ReplayBuffer and ProcessNoise helpers (a minimal sketch of both follows
# this example).
import numpy as np
import tensorflow as tf


class Agent:
    def __init__(self, state_dim, action_dim, explore_noise="Gaussian", *args, **kwargs):
        self.lr = 1e-4                      # learning rate (actor and critic)
        self.gamma = 0.99                   # discount factor
        self.tau = 0.005                    # Polyak coefficient for soft target updates
        self.bs = 512                       # minibatch size
        self.bfs = 1000000                  # replay buffer capacity
        self.d = 2                          # policy/target update delay (TD3)
        self.explore_noise = explore_noise
        self.explore_noise_size = 0.1       # exploration noise scale (or 0.01)
        self.process_noise_generator = ProcessNoise(action_dim)
        self.criticreg_noise_size = 0.2     # target policy smoothing noise std
        self.criticreg_noise_clip = 0.5     # clip range for the smoothing noise

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.actor_nn_dim = [256, 256, self.action_dim]
        self.critic_nn_dim = [256, 256, 1]

        self.state1_place = tf.placeholder(tf.float32, [None, self.state_dim])
        self.action_place = tf.placeholder(tf.float32, [None, self.action_dim])
        self.reward_place = tf.placeholder(tf.float32, [None,1])
        self.isdone_place = tf.placeholder(tf.float32, [None,1])
        self.state2_place = tf.placeholder(tf.float32, [None, self.state_dim])


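        # TD3 target computation: the target actor proposes next actions,
        # clipped Gaussian noise is added (target policy smoothing), and the
        # smaller of the two target critics builds the TD target (clipped
        # double-Q learning).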
        with tf.variable_scope("target_actor", reuse = tf.AUTO_REUSE):
            self.Q_next_action = self.actor_nn(self.state2_place)
        self.Q_next_noise = tf.clip_by_value(
            # sized dynamically so evaluation batches smaller than self.bs also work
            tf.random.normal(tf.shape(self.Q_next_action), 0, self.criticreg_noise_size),
            -self.criticreg_noise_clip, self.criticreg_noise_clip)
        self.Q_next_noisy_action = tf.clip_by_value(self.Q_next_action + self.Q_next_noise, -1, 1)
        with tf.variable_scope("target_critic_1", reuse = tf.AUTO_REUSE):
            self.Q_critic_1 = self.critic_nn(self.state2_place, self.Q_next_noisy_action)
        with tf.variable_scope("target_critic_2", reuse = tf.AUTO_REUSE):
            self.Q_critic_2 = self.critic_nn(self.state2_place, self.Q_next_noisy_action)
        self.Q_critic_min = tf.minimum(self.Q_critic_1, self.Q_critic_2)
        self.Q_y = self.reward_place + self.gamma * (1-self.isdone_place) * self.Q_critic_min
        with tf.variable_scope("main_critic_1", reuse = tf.AUTO_REUSE):
            self.Q_Q_1 = self.critic_nn(self.state1_place, self.action_place)
        with tf.variable_scope("main_critic_2", reuse = tf.AUTO_REUSE):
            self.Q_Q_2 = self.critic_nn(self.state1_place, self.action_place) 
        self.Q_loss = tf.reduce_mean((self.Q_Q_1 - self.Q_y)**2) + tf.reduce_mean((self.Q_Q_2 - self.Q_y)**2)

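        # Deterministic policy gradient: the actor is trained to maximize the
        # first main critic's value of its own actions, hence the negated mean.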
        with tf.variable_scope("main_actor", reuse = tf.AUTO_REUSE):
            self.P_this_action = self.actor_nn(self.state1_place)
        with tf.variable_scope("main_critic_1", reuse = tf.AUTO_REUSE):
            self.P_Q_1 = self.critic_nn(self.state1_place, self.P_this_action)
        self.P_loss = - tf.reduce_mean(self.P_Q_1)

        with tf.variable_scope("main_actor", reuse = tf.AUTO_REUSE):
            self.action = self.actor_nn(self.state1_place)


        all_variables = tf.trainable_variables()
        self.main_critic_var = [i for i in all_variables if "main_critic" in i.name]
        self.target_critic_var = [i for i in all_variables if "target_critic" in i.name]
        self.main_actor_var = [i for i in all_variables if "main_actor" in i.name]
        self.target_actor_var = [i for i in all_variables if "target_actor" in i.name]

        assert len(self.main_critic_var) == len(self.target_critic_var)
        assert len(self.main_actor_var) == len(self.target_actor_var)

        self.Q_op =  tf.train.AdamOptimizer(self.lr).minimize(self.Q_loss, var_list = self.main_critic_var) 
        self.P_op =  tf.train.AdamOptimizer(self.lr).minimize(self.P_loss, var_list = self.main_actor_var) 
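        # T_init hard-copies the main weights into the target networks once;
        # T_op then tracks them slowly via Polyak averaging with rate tau.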
        self.T_init = [tf.assign(T, M) for (T,M) in zip(self.target_critic_var + self.target_actor_var, 
                                                        self.main_critic_var + self.main_actor_var)]
        self.T_op = [tf.assign(T, self.tau * M + (1 - self.tau) * T) for (T,M) in zip(
            self.target_critic_var + self.target_actor_var, self.main_critic_var + self.main_actor_var)]


        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)

        self.step_count = 0
        self.total_step_count = 0
        self.episode_count = 0
        self.train_count = 0

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.T_init)
        self.saver = tf.train.Saver(max_to_keep=1000)


    def actor_nn(self, state, bound = True):
        dim = self.actor_nn_dim
        A = state
        for i in range(0,len(dim)-1):
            A = tf.layers.dense(A, units= dim[i], activation = tf.nn.relu)
        action = tf.layers.dense(A, units= dim[-1], activation = tf.nn.tanh)
        return action


    def critic_nn(self, state, action):
        dim = self.critic_nn_dim
        A = tf.concat([state, action], axis = 1)
        for i in range(0,len(dim)-1):
            A = tf.layers.dense(A, units= dim[i], activation = tf.nn.relu)
        critic = tf.layers.dense(A, units= dim[-1], activation = None)
        return critic



    def get_action(self, state_data, stochastic = True):
        this_action = self.sess.run(self.action, feed_dict= {self.state1_place: state_data})
        if stochastic:
            if self.explore_noise == "Gaussian":
                explore_noise = np.random.normal(0, self.explore_noise_size, [1, self.action_dim])
            elif self.explore_noise == "Process":
                explore_noise = self.explore_noise_size * self.process_noise_generator.next()
            else:
                raise NotImplementedError
            this_action = np.clip(this_action + explore_noise, -1, 1)
        return this_action

    def eval_loss(self, bs=None):
        # Evaluate the current critic/actor losses on a sampled batch (no training ops)
        if bs is None:
            bs = self.bs
        this_bs = np.minimum(bs, self.replay_buffer.size)
        this_batch = self.replay_buffer.sample_batch(this_bs)
        feed_dict = {self.state1_place: this_batch["obs1"],
                     self.action_place: this_batch["acts"],
                     self.reward_place: this_batch["rews"],
                     self.isdone_place: this_batch["done"],
                     self.state2_place: this_batch["obs2"]}
        return self.sess.run([self.Q_loss, self.P_loss], feed_dict=feed_dict)

    def train_iter(self):

        if self.bs <= self.replay_buffer.size:
            this_batch = self.replay_buffer.sample_batch(self.bs)
            
            feed_dict = {self.state1_place: this_batch["obs1"],
                        self.action_place: this_batch["acts"],
                        self.reward_place: this_batch["rews"],
                        self.isdone_place: this_batch["done"],
                        self.state2_place: this_batch["obs2"]}

            self.sess.run([self.Q_op], feed_dict=feed_dict)
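            # TD3 delayed updates: train the actor and refresh the targets
            # only every self.d critic updates.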
            if self.total_step_count % self.d == 0:
                self.sess.run([self.P_op], feed_dict=feed_dict)
                self.sess.run(self.T_op)

            self.train_count += 1
            self.total_step_count += 1


    def record(self, this_state, this_action, this_reward, this_done, next_state):
        self.replay_buffer.store(obs=this_state, 
                                act=this_action,
                                rew=this_reward,
                                next_obs=next_state,
                                done=this_done)
        self.step_count += 1


    def reset_agent(self):

        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)

        self.step_count = 0
        self.total_step_count = 0
        self.train_count = 0
        self.episode_count = 0

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.T_init)

    def reset_episode(self):

        self.step_count = 0
        self.train_count = 0
        self.episode_count += 1
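
Example #1 assumes two project-local helpers that are not shown: a replay
buffer exposing store/sample_batch and a size attribute, and a ProcessNoise
generator. The sketch below is a minimal stand-in consistent with how they
are called above, not the original implementations; the Ornstein-Uhlenbeck
parameters (theta, sigma, dt) are illustrative assumptions.

import numpy as np

class ReplayBuffer:
    # Fixed-size FIFO buffer matching the store/sample_batch calls above
    def __init__(self, state_dim, action_dim, max_size):
        self.obs1 = np.zeros([max_size, state_dim], dtype=np.float32)
        self.obs2 = np.zeros([max_size, state_dim], dtype=np.float32)
        self.acts = np.zeros([max_size, action_dim], dtype=np.float32)
        self.rews = np.zeros([max_size, 1], dtype=np.float32)
        self.done = np.zeros([max_size, 1], dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, max_size

    def store(self, obs, act, rew, next_obs, done):
        self.obs1[self.ptr] = obs
        self.obs2[self.ptr] = next_obs
        self.acts[self.ptr] = act
        self.rews[self.ptr] = rew
        self.done[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1[idx], obs2=self.obs2[idx],
                    acts=self.acts[idx], rews=self.rews[idx],
                    done=self.done[idx])

class ProcessNoise:
    # Ornstein-Uhlenbeck noise; theta/sigma/dt are assumed values
    def __init__(self, action_dim, theta=0.15, sigma=0.2, dt=1e-2):
        self.action_dim, self.theta, self.sigma, self.dt = action_dim, theta, sigma, dt
        self.x = np.zeros(action_dim)

    def next(self):
        self.x = (self.x - self.theta * self.x * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(self.action_dim))
        return self.x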
Example #2
# Assumes NumPy, Gym and TensorFlow 2.x, plus the project-local agent_factory
# and ReplayBuffer helpers (agent_factory is sketched after this example).
import gym
import numpy as np
import tensorflow as tf

def main(config):
    env_name = config['run']['env']
    env = gym.make(env_name)
    np.random.seed(config['random_seed'])
    tf.random.set_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape

    # Use action_dim[0]: (a_dim,) --> a_dim
    action_dim = env.action_space.shape[0]

    # Define action boundaries for continuous but bounded action space
    action_low = env.action_space.low
    action_high = env.action_space.high

    print(f'-------- {env_name} --------')
    print('STATE DIM: ', state_dim)
    print('ACTION DIM: ', action_dim)
    print('ACTION LOW: ', action_low)
    print('ACTION HIGH: ', action_high)
    print('----------------------------')

    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'],
                                 config['random_seed'])

    # Take a random action in the environment to initialize networks
    env.reset()
    _, initial_reward, _, _ = env.step(env.action_space.sample())

    # Use agent_factory to build the agent using the algorithm specified in the config file
    Agent = agent_factory(config['agent']['model'])
    agent = Agent(config, state_dim, action_dim, action_low, action_high,
                  initial_reward)

    for episode in range(int(config['train']['max_episodes'])):
        s = env.reset()
        s = s / 255.0  # scale observations (e.g. pixel values) into [0, 1]

        episode_reward = 0
        episode_average_max_q = 0
        loss_actor = criticQ = criticV = None  # stay defined if no update happens this episode

        for step in range(int(config['train']['max_episode_len'])):
            if config['run']['render_env']:
                env.render()

            # 1. Use current behavioural policy network to predict an action to take
            # TODO: the [0] works for new SAC. Check again with DDPG updates.
            a = agent.actor.model.predict(np.expand_dims(s, 0))[0]

            # print('ACTION: ', a)
            # print('a[0]: ', a[0])

            # 2. Use action to take step in environment and receive next step, reward, etc.
            s2, r, terminal, info = env.step(a[0])
            s2 = s2 / 255.0

            # 3. Update the replay buffer with the most recent experience
            replay_buffer.add(np.reshape(s, state_dim),
                              np.reshape(a, action_dim), r,
                              np.reshape(s2, state_dim), terminal)

            # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
            if replay_buffer.size() > batch_size:
                experience = replay_buffer.sample_batch(batch_size)

                # Train current behavioural networks
                # predicted_Q_value = agent.train_networks(experience)
                loss_actor, criticQ, criticV = agent.train_networks(experience)

                # Update for logging
                # episode_average_max_q += np.amax(predicted_Q_value)

                # Soft update of frozen target networks
                agent.update_target_networks()

            # Update information for next step
            s = s2
            episode_reward += r

            if terminal:
                print(
                    f'Episode {episode} training losses: ACTOR: {loss_actor} | CRITIC_Q: {criticQ} | CRITIC_V: {criticV}'
                )
                # print(f'| Reward: {int(episode_reward)} | Episode: {episode} | Qmax: {episode_average_max_q / float(step)}')
                break

    if config['run']['use_gym_monitor']:
        env.monitor.close()
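
Examples #2, #3 and #5 all obtain their agent class through an agent_factory
helper that is not shown. A plausible minimal sketch of such a factory is
below; the model names and agent classes in the registry are hypothetical
stand-ins for whatever the project actually defines.

def agent_factory(model_name):
    # Hypothetical registry mapping config names to agent classes
    registry = {
        'ddpg': DDPGAgent,   # placeholder class names, not real imports
        'td3': TD3Agent,
        'sac': SACAgent,
    }
    try:
        return registry[model_name.lower()]
    except KeyError:
        raise ValueError(f'Unknown agent model: {model_name}')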
Example #3
# Assumes NumPy, Gym and TensorFlow (tf.compat.v1 graph mode), plus the
# project-local agent_factory, ReplayBuffer and build_summaries helpers
# (build_summaries is sketched after this example).
import gym
import numpy as np
import tensorflow as tf

def main(config):
    tf.compat.v1.reset_default_graph()
    env_name = config['run']['env']
    env = gym.make(env_name)
    np.random.seed(config['random_seed'])
    tf.compat.v1.set_random_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape

    # Use action_dim[0]: (a_dim,) --> a_dim
    action_dim = env.action_space.shape[0]

    # Define action boundaries for continuous but bounded action space
    action_low = env.action_space.low
    action_high = env.action_space.high

    print(f'-------- {env_name} --------')
    print('STATE DIM: ', state_dim)
    print('ACTION DIM: ', action_dim)
    print('ACTION LOW: ', action_low)
    print('ACTION HIGH: ', action_high)
    print('----------------------------')
    
    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'], config['random_seed'])
        
    # Set up summary TF operations
    summary_ops, summary_vars = build_summaries()

    with tf.compat.v1.Session() as sess:
        # sess.run(tf.compat.v1.global_variables_initializer())
        writer = tf.compat.v1.summary.FileWriter(config['output']['summary_dir'], sess.graph)

        # Use agent_factory to build the agent using the algorithm specified in the config file.
        Agent = agent_factory(config['agent']['model'])
        agent = Agent(config, state_dim, action_dim, action_low, action_high, sess)

        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(int(config['train']['max_episodes'])):
            s = env.reset()
            episode_reward = 0
            episode_average_max_q = 0

            for j in range(int(config['train']['max_episode_len'])):
                if config['run']['render_env']:
                    env.render()

                # 1. Predict an action to take
                a = agent.actor.predict_action(np.expand_dims(s, 0))

                # 2. Use action to take step in environment and receive next step, reward, etc.
                s2, r, terminal, info = env.step(a[0])

                # 3. Update the replay buffer with the most recent experience
                replay_buffer.add(np.reshape(s, state_dim), np.reshape(a, action_dim), r,
                                  np.reshape(s2, state_dim), terminal)

                # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
                if replay_buffer.size() > batch_size:
                    experience = replay_buffer.sample_batch(batch_size)

                    # Train current behavioural networks
                    predicted_Q_value = agent.train_networks(experience)

                    # Update for logging
                    episode_average_max_q += np.amax(predicted_Q_value)

                    # Update target networks
                    agent.update_target_networks()

                # Update information for next step
                s = s2
                episode_reward += r

                if terminal:
                    steps = max(j, 1)  # guard against j == 0 if the episode ends on its first step
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: episode_reward,
                        summary_vars[1]: episode_average_max_q / float(steps)
                        })

                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                        int(episode_reward), i, episode_average_max_q / float(steps)))

                    break

    if config['run']['use_gym_monitor']:
        env.monitor.close()
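
build_summaries is not shown in these examples. Below is a minimal sketch
consistent with how summary_ops and summary_vars are used above (two scalar
placeholders fed once per episode); the scalar tags are assumptions.

def build_summaries():
    episode_reward = tf.compat.v1.placeholder(tf.float32)
    tf.compat.v1.summary.scalar('Reward', episode_reward)
    episode_ave_max_q = tf.compat.v1.placeholder(tf.float32)
    tf.compat.v1.summary.scalar('Qmax_Value', episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.compat.v1.summary.merge_all()
    return summary_ops, summary_vars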
Example #4
# Assumes PyTorch with TensorBoard support, NumPy, and the project-local
# helpers core (MLPActorCritic, count_vars), ReplayBuffer, set_seeds and
# colorize, which are not shown here.
import itertools
import os
import time
from copy import deepcopy
from typing import Any, Dict, List

import numpy as np
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter


class SAC:
    def __init__(self,
                 env,
                 test_env,
                 actor_critic=core.MLPActorCritic,
                 ac_kwargs=dict(),
                 seed=0,
                 steps_per_epoch=4000,
                 epochs=100,
                 replay_size=int(1e6),
                 gamma=0.99,
                 polyak=0.995,
                 entropy_tuning: bool = False,
                 lr=1e-3,
                 alpha=0.2,
                 batch_size=100,
                 start_steps=10000,
                 update_after=1000,
                 update_every=50,
                 act_noise=0.01,
                 max_ep_len=1000,
                 device='cpu',
                 num_test_episodes=1,
                 save_freq=2,
                 log_mode: List[str] = ["stdout"],
                 log_key: str = "timestep",
                 save_model: str = "checkpoints",
                 checkpoint_path: str = None,
                 log_interval: int = 10,
                 load_model=False,
                 dir_prefix: str = None):

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.seed = seed
        self.env = env
        self.test_env = test_env
        self.obs_dim = env.observation_space.shape
        self.act_dim = env.action_space.shape[0]
        self.act_limit = env.action_space.high[0]
        self.replay_size = replay_size
        self.batch_size = batch_size
        #self.noise_scale = act_noise

        self.load_model = load_model
        self.log_key = log_key
        #self.logdir = logdir
        self.save_model = save_model
        self.checkpoint_path = checkpoint_path
        #self.log_interval = log_interval
        #self.logger = Logger(logdir=logdir, formats=[*log_mode])

        #self.pi_lr = pi_lr
        #self.q_lr = q_lr
        self.lr = lr
        self.ac_kwargs = ac_kwargs

        self.steps_per_epoch = steps_per_epoch
        self.epochs = epochs
        self.max_ep_len = max_ep_len

        self.gamma = gamma
        self.polyak = polyak
        self.alpha = alpha
        self.entropy_tuning = entropy_tuning

        self.start_steps = start_steps
        self.update_after = update_after
        self.update_every = update_every
        self.save_freq = save_freq

        self.action_time_step = 0  #no. of updates
        self.current_timestep = 0
        self.current_epoch = 0
        self.dir_prefix = dir_prefix

        # Store the weights and scores in a new directory
        self.directory = "logs/sac_single_Agent_{}{}/".format(
            self.dir_prefix,
            time.strftime("%Y%m%d-%H%M%S"))  # appends the timedate
        os.makedirs(self.directory, exist_ok=True)
        self.model_dir = os.path.join(self.directory, 'model_param/')
        os.makedirs(self.model_dir)

        # Tensorboard writer object
        self.writer = SummaryWriter(log_dir=self.directory + 'tensorboard/')
        print("Logging to {}\n".format(self.directory + 'tensorboard/'))

        #self.test_env = env
        self.num_test_episodes = num_test_episodes

        # Create actor-critic module and target networks
        self.ac = actor_critic(self.env.observation_space,
                               self.env.action_space,
                               **ac_kwargs).to(self.device)
        self.ac_targ = deepcopy(self.ac).to(self.device)
        # Note: SAC needs no target actor; only the target critics inside
        # ac_targ are actually used.

        if self.load_model:
            if os.path.exists(self.checkpoint_path):
                self.ac.load_state_dict(
                    torch.load(os.path.abspath(self.checkpoint_path)))
                self.ac_targ = deepcopy(self.ac).to(self.device)

        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        # Parameters of both Q-networks, materialized as a list so they can be
        # iterated more than once (itertools.chain is a one-shot iterator and
        # would be exhausted by the optimizer, breaking the freeze step below)
        self.q_params = list(itertools.chain(self.ac.q1.parameters(),
                                             self.ac.q2.parameters()))

        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.lr)
        self.pi_scheduler = StepLR(self.pi_optimizer, step_size=1, gamma=0.96)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.q_scheduler = StepLR(self.q_optimizer, step_size=1, gamma=0.96)

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                          act_dim=self.act_dim,
                                          size=self.replay_size)

        # from https://github.com/SforAiDl/genrl/blob/master/genrl/deep/agents/sac/sac.py
        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

        # No action-scale handling is needed here: the action limit is obtained
        # directly inside the MLPActorCritic class, and for the CityLearn
        # environment the actions are bounded in [-1/3, +1/3], so the bias is 0.

        # Assign device from the constructor argument and move the networks to
        # it, since they were created with the availability-based default above
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")
        self.ac.to(self.device)
        self.ac_targ.to(self.device)

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        #initialize logs
        self.empty_logs()

        # Count variables (protip: try to get a feel for how different size networks behave!)
        var_counts = tuple(
            core.count_vars(module)
            for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        print(var_counts)
        self.logs["var_counts"] = var_counts
        print(
            colorize(
                '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
                var_counts,
                'green',
                bold=True))
        self.writer.add_scalar('Number of parameters/pi', var_counts[0])
        self.writer.add_scalar('Number of parameters/q1', var_counts[1])
        self.writer.add_scalar('Number of parameters/q2', var_counts[2])
        #print(colorize(msg, color, bold=True))

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from a pretrained model
        """
        self.ac.q1.load_state_dict(weights["q1_weights"])
        self.ac.q2.load_state_dict(weights["q2_weights"])
        self.ac.pi.load_state_dict(weights["policy_weights"])

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["q1_loss"] = []
        self.logs["q2_loss"] = []
        self.logs["policy_loss"] = []
        self.logs["alpha_loss"] = []
        self.logs["var_counts"] = ()

    @staticmethod
    def safe_mean(log: List[float]):
        """
        Returns the mean of the log, or 0 if it is empty
        """
        return np.mean(log) if len(log) > 0 else 0

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": self.safe_mean(self.logs["policy_loss"]),
            "q1_loss": self.safe_mean(self.logs["q1_loss"]),
            "q2_loss": self.safe_mean(self.logs["q2_loss"]),
            "alpha_loss": self.safe_mean(self.logs["alpha_loss"]),
        }

        self.empty_logs()
        return logs

    # Set up function for computing SAC Q-losses
    def compute_loss_q(self, data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = self.ac.q1(o, a)
        q2 = self.ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = self.ac.pi(o2)

            # Target Q-values
            q1_pi_targ = self.ac_targ.q1(o2, a2)
            q2_pi_targ = self.ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
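            # Entropy-regularized backup: y = r + gamma * (1 - d) *
            # (min_i Q_targ_i(o2, a2) - alpha * log pi(a2 | o2))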
            backup = r + self.gamma * (1 - d) * (q_pi_targ -
                                                 self.alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        #logging into tensorboard
        self.writer.add_scalar('loss/Critic1_loss', loss_q1,
                               self.current_timestep)
        self.writer.add_scalar('loss/Critic2_loss', loss_q2,
                               self.current_timestep)

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        self.logs["q1_loss"].append(loss_q1.item())
        self.logs["q2_loss"].append(loss_q2.item())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(self, data):
        o = data['obs']
        pi, logp_pi = self.ac.pi(o)
        q1_pi = self.ac.q1(o, pi)
        q2_pi = self.ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (self.alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        # Temperature (alpha) loss; stays zero when entropy tuning is disabled
        alpha_loss = torch.tensor(0.0).to(self.device)

        if self.entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (logp_pi + self.target_entropy).detach()).mean()
            self.writer.add_scalar('loss/entropy_tuning_loss', alpha_loss,
                                   self.current_timestep)
            self.logs["alpha_loss"].append(alpha_loss.item())

        #logging into tensorboard
        self.writer.add_scalar('loss/Actor_loss', loss_pi,
                               self.current_timestep)

        self.logs["policy_loss"].append(loss_pi.item())

        return loss_pi, alpha_loss, pi_info

    def update(self, data):
        # First run one gradient descent step for Q1 and Q2
        self.q_optimizer.zero_grad()
        loss_q, q_info = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in self.q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi, alpha_loss, pi_info = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()

        if self.entropy_tuning:
            # Next run one gradient descent step for alpha.
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()

            self.writer.add_scalar('entropy_tuning_param/alpha', self.alpha,
                                   self.current_timestep)

        # Unfreeze Q-network so you can optimize it at the next SAC step.
        for p in self.q_params:
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(),
                                 self.ac_targ.parameters()):
                # NB: we use the in-place operations "mul_" and "add_" to update
                # target params; "mul" and "add" would allocate new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

    def reset_action_tracker(self):
        self.action_tracker = []

    def reset_reward_tracker(self):
        self.reward_tracker = []

    def get_action(self, o, deterministic=False):
        return self.ac.act(
            torch.as_tensor(o, dtype=torch.float32).to(self.device),
            deterministic)

    def eval_agent(self, test=True):
        if test:
            eval_env = self.test_env
            t_env = 'testing environment'
        else:
            eval_env = deepcopy(self.env)
            t_env = 'training environment'
        ep_rews = []
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = eval_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                # o = (o-self.replay_buffer.obs_buf_min) /(self.replay_buffer.obs_buf_max - self.replay_buffer.obs_buf_min)
                nom = o - self.replay_buffer.obs_buf_min
                denom = self.replay_buffer.obs_buf_max - self.replay_buffer.obs_buf_min
                denom[denom == 0] = 1
                o = nom / denom
                o, r, d, _ = eval_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
            ep_rews.append(ep_ret)

        print("Evaluating on the {} for {} episode, Mean Reward: {}".format(
            t_env, self.num_test_episodes, np.mean(ep_rews)))
        #print('Final cost',eval_env.cost())

        self.writer.add_scalar("Scores/ramping",
                               eval_env.cost()['ramping'], self.current_epoch)
        self.writer.add_scalar("Scores/1-load_factor",
                               eval_env.cost()['1-load_factor'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/average_daily_peak",
                               eval_env.cost()['average_daily_peak'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/peak_demand",
                               eval_env.cost()['peak_demand'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/net_electricity_consumption",
                               eval_env.cost()['net_electricity_consumption'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/total",
                               eval_env.cost()['total'], self.current_epoch)
        self.writer.add_scalar("Scores/test_episode_reward", np.mean(ep_rews),
                               self.current_epoch)

        return np.mean(ep_rews), eval_env.cost()['total']

    def learn(self) -> None:

        ep_num = 0
        best_score = 1.5
        return_per_episode = []

        # Prepare for interaction with environment
        total_steps = self.steps_per_epoch * self.epochs
        epoch_start_time = time.time()
        o, ep_ret, ep_len = self.env.reset(), 0, 0
        #self.current_epoch=1
        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):

            self.current_timestep = t  #for logging

            # Once t > 8759 (one year of collected data), compute the buffer's
            # min-max statistics of observations and rewards for normalization

            if t == self.start_steps:
                self.replay_buffer.collect_minmax()

            # Until start_steps have elapsed, randomly sample actions
            # from a uniform distribution for better exploration. Afterwards,
            # use the learned policy.
            if t > self.start_steps:
                #print(t)
                a = self.get_action(o)
            else:
                a = self.env.action_space.sample()

            # Step the env
            o2, r, d, _ = self.env.step(a)
            self.writer.add_scalar('Rewards/single_Agent_reward', r,
                                   self.current_timestep)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == self.max_ep_len else d

            # Store experience to replay buffer
            self.replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            # End of trajectory handling
            if d or (ep_len == self.max_ep_len):
                #print('End of trajectory: Episode return is', ep_ret )
                #print('Cost function is', self.env.cost())
                ep_num += 1
                return_per_episode.append(ep_ret)
                self.writer.add_scalar('Rewards/return_per_episode', ep_ret,
                                       ep_num)

                o, ep_ret, ep_len = self.env.reset(), 0, 0

            # Update handling
            if t >= self.update_after and t % self.update_every == 0:
                #if t >= self.update_after: #instead of updating for some fixed steps, update for every step
                #print('updating')
                for _ in range(self.update_every):
                    batch = self.replay_buffer.sample_batch(self.batch_size)
                    #print(batch)
                    #print(batch.size)
                    #sys.exit()
                    self.update(data=batch)

            #End of epoch handling
            if (t + 1) % self.steps_per_epoch == 0:
                epoch = (t + 1) // self.steps_per_epoch

                self.current_epoch += 1
                self.pi_scheduler.step()
                self.q_scheduler.step()

                print('Epoch:', epoch, 'Policy_LR:',
                      self.pi_scheduler.get_lr(), 'Critic_LR:',
                      self.q_scheduler.get_lr())

                print('time step: {} , epoch: {} ,time elapsed: {} '.format(
                    t + 1, epoch,
                    time.time() - epoch_start_time))
                train_mean_return, test_score = self.eval_agent(test=False)

                #test_mean_return=self.eval_agent(test=True)
                #print('time_per_epoch',time.time()-epoch_start_time)
                epoch_start_time = time.time()
                print('\n')

                # Save model
                if (epoch % self.save_freq == 0):
                    if test_score < best_score:
                        best_score = test_score
                        print(
                            'Better evaluation score and hence saving model to {}'
                            .format(
                                os.path.join(self.directory, 'model_param/')))
                        torch.save(
                            self.ac.state_dict(),
                            os.path.join(self.directory, 'model_param/') +
                            'checkpoint.pt')

            if (t + 1) % self.steps_per_epoch == 0:
                self.action_time_step = 0

            else:
                self.action_time_step += 1

        return epoch, train_mean_return * (self.batch_size)
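
A minimal usage sketch for this class. The environment constructor is a
hypothetical placeholder: eval_agent calls env.cost(), so a CityLearn-style
environment exposing that method is assumed, and the keyword values are
illustrative only.

env = make_citylearn_env()       # hypothetical constructor, not a real API
test_env = make_citylearn_env()
agent = SAC(env, test_env,
            epochs=10,
            steps_per_epoch=8760,   # one year of hourly steps
            start_steps=8760,
            entropy_tuning=True,
            dir_prefix='demo_')
agent.learn()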
Example #5
# Assumes NumPy, Gym and TensorFlow 1.x, plus the project-local agent_factory,
# ReplayBuffer and build_summaries helpers (sketched after Examples #2 and #3).
import gym
import numpy as np
import tensorflow as tf

def main(config):
    env_name = config['run']['env']
    env = gym.make(env_name)
    np.random.seed(config['random_seed'])
    tf.set_random_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Define action boundaries for continuous but bounded action space
    action_bound = env.action_space.high

    print(f'-------- {env_name} --------')
    print('ACTION SPACE: ', action_dim)
    print('ACTION BOUND: ', action_bound)
    print('STATE SPACE: ', state_dim)
    print('----------------------------')


    # TODO (20190831, JP): add normalization for envs that require it.
    # Ensure the action bound is symmetric - important
    assert np.all(env.action_space.high == -env.action_space.low)

    # Use agent_factory to build the agent using the algorithm specified in the config file.
    Agent = agent_factory(config['agent']['model'])
    agent = Agent(config, state_dim, action_dim, action_bound)

    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'], config['random_seed'])

    # Set up summary TF operations
    summary_ops, summary_vars = build_summaries()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(config['output']['summary_dir'], sess.graph)

        # Initialize target network weights
        agent.update_target_networks(sess)

        for i in range(int(config['train']['max_episodes'])):
            s = env.reset()

            episode_reward = 0
            episode_average_max_q = 0

            for j in range(int(config['train']['max_episode_len'])):
                if config['run']['render_env']:
                    env.render()

                # 1. Predict an action to take
                a = agent.actor_predict_action(np.reshape(s, (1, state_dim)), sess)

                # 2. Use action to take step in environment and receive next step, reward, etc.
                s2, r, terminal, info = env.step(a[0])

                # 3. Update the replay buffer with the most recent experience
                replay_buffer.add(np.reshape(s, (state_dim,)), np.reshape(a, (action_dim,)), r,
                                  np.reshape(s2, (state_dim,)), terminal)

                # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
                if replay_buffer.size() > batch_size:
                    s_batch, a_batch, r_batch, s2_batch, t_batch = replay_buffer.sample_batch(batch_size)

                    # Train current behavioural networks
                    predicted_Q_value = agent.train_networks(s_batch, a_batch, r_batch, s2_batch, t_batch, sess)

                    # Update for logging
                    episode_average_max_q += np.amax(predicted_Q_value)

                    # Update target networks
                    agent.update_target_networks(sess)

                # Update information for next step
                s = s2
                episode_reward += r

                # TODO (20190815, JP): as this could be different for each agent, do
                # agent.summarize_episode(summary_ops, summary_vars, episode_reward, sess) for when each agent requires own summaries?
                if terminal:
                    steps = max(j, 1)  # guard against j == 0 if the episode ends on its first step
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: episode_reward,
                        summary_vars[1]: episode_average_max_q / float(steps)
                        })

                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                        int(episode_reward), i, episode_average_max_q / float(steps)))

                    break

    if config['run']['use_gym_monitor']:
        env.monitor.close()