Example #1
 def __init__(self, agent, env, args):
     self.agent = agent
     self.env = env
     self.args = args
     self.global_timestep = tf.train.get_or_create_global_step()
     self.args.max_episode_steps = args.max_episode_steps if args.max_episode_steps else 27000  # default borrowed from Dopamine
     self.summary_writer = tf.contrib.summary.create_file_writer(args.log_dir)
     # TODO: we may not need to create a session at all in eager mode.
     self.sess = self._get_session(eager=True)
     self.log = logger(args)
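
The `_get_session` helper is not shown in this snippet. Since the TODO above notes that a session may be unnecessary under eager execution, here is a minimal sketch of what such a helper could look like (the body is an assumption, written as a free function for brevity):

import tensorflow as tf

def _get_session(eager=False):
    # hypothetical helper: in eager mode ops run immediately, so no session is needed
    if eager or tf.executing_eagerly():
        return None
    # graph mode: fall back to a plain TF1-style session
    return tf.compat.v1.Session()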
Example #2
tf.random.set_random_seed(params.seed)

replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
random_process = OrnsteinUhlenbeckProcess(size=env.action_space.shape[0],
                                          theta=0.15,
                                          mu=params.mu,
                                          sigma=params.sigma)
agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params)

get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)

traj = list()

with summary_writer.as_default():
    # for summary purposes, keep all the training code inside this writer context
    with tf.contrib.summary.always_record_summaries():

        for i in itertools.count():
            state = env.reset()
            total_reward = 0
            self_rewards = 0
            start = time.time()
            agent.random_process.reset_states()
            done = False
            episode_len = 0
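
The `OrnsteinUhlenbeckProcess` constructed above supplies the temporally correlated exploration noise commonly used with DDPG; its implementation is not part of this example. A minimal sketch, assuming the usual discretised OU dynamics and a `sample()` method alongside the `reset_states()` call seen above (the `dt` parameter and the `sample()` name are assumptions):

import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset_states()

    def reset_states(self):
        # restart the process from the mean at the beginning of every episode
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # one Euler step of the OU stochastic differential equation
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        self.x_prev = x
        return x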
Example #3
def train_DQN(agent, env, policy, replay_buffer, reward_buffer, params,
              summary_writer):
    """
    Train the DQN agent defined above.

    :param agent: agent holding the main and target models
    :param env: environment to interact with
    :param policy: action-selection policy (e.g. epsilon-greedy)
    :param replay_buffer: experience replay buffer
    :param reward_buffer: buffer of recent episode rewards
    :param params: hyper-parameters
    :param summary_writer: TensorBoard summary writer
    :return: list of episode rewards and list of losses
    """

    # Create a global step variable
    # global_step = tf.Variable(0, name='global_step', trainable=False)

    # log purpose
    losses, all_rewards, cnt_action = [], [], []
    episode_reward, index_episode = 0, 0
    log = logger(params)

    with tf.Session() as sess:
        # initialise all variables used in the model
        sess.run(tf.global_variables_initializer())
        global_step = sess.run(tf.train.get_or_create_global_step())
        state = env.reset()
        start = time.time()
        for frame_idx in range(1, params.num_frames + 1):
            action = policy.select_action(sess, agent.main_model,
                                          state.reshape(params.state_reshape))
            cnt_action.append(action)
            next_state, reward, done, _ = env.step(action)

            replay_buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward
            global_step += 1

            if done:
                index_episode += 1
                policy.index_episode = index_episode
                state = env.reset()
                all_rewards.append(episode_reward)

                if frame_idx > params.learning_start and len(
                        replay_buffer) > params.batch_size:
                    states, actions, rewards, next_states, dones = replay_buffer.sample(
                        params.batch_size)
                    next_Q = agent.target_model.predict(sess, next_states)
                    # Y = rewards + params.gamma * np.max(next_Q, axis=1)
                    Y = rewards + params.gamma * np.max(
                        next_Q, axis=1) * np.logical_not(dones)
                    loss = agent.main_model.update(sess, states, actions, Y)

                    # Logging and refreshing log purpose values
                    losses.append(loss)
                    log.logging(frame_idx, params.num_frames, index_episode,
                                time.time() - start, episode_reward,
                                np.mean(loss), policy.current_epsilon(),
                                cnt_action)

                episode_reward = 0
                cnt_action = []
                start = time.time()

                if np.random.rand() > 0.5:
                    # a soft update blends a fraction (tau) of the main model's weights into
                    # the target model instead of copying the weights over completely
                    if params.update_hard_or_soft == "hard":
                        sync_main_target(sess, agent.target_model,
                                         agent.main_model)
                    elif params.update_hard_or_soft == "soft":
                        soft_target_model_update(sess,
                                                 agent.target_model,
                                                 agent.main_model,
                                                 tau=params.soft_update_tau)

    # test(sess, main_model, env, params)

    return all_rewards, losses
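
`sync_main_target` and `soft_target_model_update` are called above but not defined in this example. A minimal TF1-style sketch, under the assumption that each model exposes a `scope` attribute naming its variable scope (that attribute, and building the assign ops on the fly, are simplifications for illustration):

import tensorflow as tf

def _get_vars(scope):
    # hypothetical: collect a model's trainable variables by variable-scope name
    return sorted(tf.compat.v1.trainable_variables(scope), key=lambda v: v.name)

def sync_main_target(sess, target_model, main_model):
    """Hard update: copy the main network's weights into the target network."""
    ops = [t.assign(m) for t, m in zip(_get_vars(target_model.scope),
                                       _get_vars(main_model.scope))]
    sess.run(ops)

def soft_target_model_update(sess, target_model, main_model, tau=0.01):
    """Soft update: target <- tau * main + (1 - tau) * target."""
    ops = [t.assign(tau * m + (1.0 - tau) * t)
           for t, m in zip(_get_vars(target_model.scope),
                           _get_vars(main_model.scope))]
    sess.run(ops)

In practice the assign ops would be built once at graph-construction time rather than recreated on every call; they are inlined here only to keep the sketch short.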
Example #4
def train(agent, env, replay_buffer, reward_buffer, summary_writer,
          num_eval_episodes, num_frames, tau, eval_interval, hot_start,
          batch_size, interval_MAR, log_dir, google_colab):
    time_buffer = list()
    log = logger(num_frames=num_frames, interval_MAR=interval_MAR)
    with summary_writer.as_default():
        tf.compat.v2.summary.text(name="Hyper-params",
                                  data=params_to_markdown(
                                      gin.operative_config_str()),
                                  step=0)
        for epoch in itertools.count():
            state = env.reset()
            total_reward = 0
            start = time.time()
            agent.random_process.reset_states()
            done = False
            episode_len = 0
            while not done:
                if agent.global_ts.numpy() < hot_start:
                    action = env.action_space.sample()
                else:
                    action = agent.select_action(state)

                # scale for execution in the env (in DDPG, every action is clipped to [-1, 1] in agent.predict)
                next_state, reward, done, info = env.step(
                    action * env.action_space.high)
                replay_buffer.add(state, action, reward, next_state, done)
                """
                === Update the models
                """
                if agent.global_ts.numpy() > hot_start:
                    states, actions, rewards, next_states, dones = replay_buffer.sample(
                        batch_size)
                    loss = agent.update(states, actions, rewards, next_states,
                                        dones)
                    soft_target_model_update_eager(agent.target_actor,
                                                   agent.actor,
                                                   tau=tau)
                    soft_target_model_update_eager(agent.target_critic,
                                                   agent.critic,
                                                   tau=tau)

                agent.global_ts.assign_add(1)
                episode_len += 1
                total_reward += reward
                state = next_state

                # for evaluation purpose
                if agent.global_ts.numpy() % eval_interval == 0:
                    agent.eval_flg = True
            """
            ===== After 1 Episode is Done =====
            """
            # save the updated models
            agent.actor_manager.save()
            agent.critic_manager.save()

            # store the episode related variables
            reward_buffer.append(total_reward)
            time_buffer.append(time.time() - start)

            # logging on Tensorboard
            ts = agent.global_ts.numpy()
            tf.compat.v2.summary.scalar("train/reward", total_reward, step=ts)
            tf.compat.v2.summary.scalar("train/exec_time",
                                        time.time() - start,
                                        step=ts)
            if ts > hot_start:
                tf.compat.v2.summary.scalar("train/MAR",
                                            np.mean(reward_buffer),
                                            step=ts)

            # logging
            if ts > hot_start and epoch % interval_MAR == 0:
                log.logging(time_step=ts,
                            exec_time=np.sum(time_buffer),
                            reward_buffer=reward_buffer,
                            epsilon=0)
                time_buffer = list()

            if agent.eval_flg:
                score = eval_Agent(agent,
                                   env,
                                   log_dir=log_dir,
                                   google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                agent.eval_flg = False

            # check the stopping condition
            if ts >= num_frames:
                print("=== Training is Done ===")
                score = eval_Agent(agent,
                                   env,
                                   n_trial=num_eval_episodes,
                                   log_dir=log_dir,
                                   google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                env.close()
                break
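
`soft_target_model_update_eager` is not defined in this example. Assuming the actor and critic are `tf.keras` models, a Polyak-averaging sketch matching the call sites above could look like this:

def soft_target_model_update_eager(target_model, model, tau=0.005):
    """Polyak averaging in eager mode: target <- tau * online + (1 - tau) * target."""
    new_weights = [tau * w + (1.0 - tau) * tw
                   for w, tw in zip(model.get_weights(), target_model.get_weights())]
    target_model.set_weights(new_weights)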
Example #5
def train(global_timestep, agent, env, replay_buffer, reward_buffer,
          summary_writer, num_eval_episodes, num_frames, eval_interval,
          hot_start, train_freq, batch_size, sync_freq, interval_MAR):
    time_buffer = list()
    log = logger(num_frames=num_frames, interval_MAR=interval_MAR)
    for epoch in itertools.count():
        state = np.array(env.reset())
        total_reward = 0
        start = time.time()
        cnt_action = list()
        done = False
        while not done:
            action = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.array(next_state)
            replay_buffer.add(state, action, reward, next_state, done)

            global_timestep += 1
            agent.timestep = global_timestep
            total_reward += reward
            state = next_state
            cnt_action.append(action)

            # for evaluation purpose
            if global_timestep % eval_interval == 0:
                agent.eval_flg = True

            if (global_timestep > hot_start) and (global_timestep % train_freq
                                                  == 0):
                states, actions, rewards, next_states, dones = replay_buffer.sample(
                    batch_size)
                agent.update(states, actions, rewards, next_states, dones)

            # hard-sync the target network with the main network
            if (global_timestep > hot_start) and (global_timestep % sync_freq
                                                  == 0):
                agent.save()
                agent.sync_network()
        """
        ===== After 1 Episode is Done =====
        """
        summary_writer.add_scalar("train/reward", total_reward,
                                  global_timestep)
        summary_writer.add_scalar("train/exec_time",
                                  time.time() - start, global_timestep)
        if global_timestep > hot_start:
            summary_writer.add_scalar("train/MAR", np.mean(reward_buffer),
                                      global_timestep)
        summary_writer.add_histogram("train/taken actions",
                                     np.array(cnt_action), global_timestep)

        # store the episode reward
        reward_buffer.append(total_reward)
        time_buffer.append(time.time() - start)

        if global_timestep > hot_start and epoch % interval_MAR == 0:
            log.logging(time_step=global_timestep,
                        exec_time=np.sum(time_buffer),
                        reward_buffer=reward_buffer,
                        epsilon=agent.policy.current_epsilon(global_timestep))
            time_buffer = list()

        if agent.eval_flg:
            # replay_buffer.save()
            score = eval_Agent(agent, env)
            summary_writer.add_scalar("eval/Score", score, global_timestep)
            agent.eval_flg = False

        # check the stopping condition
        if global_timestep >= num_frames:
            print("=== Training is Done ===")
            score = eval_Agent(agent, env, n_trial=num_eval_episodes)
            summary_writer.add_scalar("eval/Score", score, global_timestep)
            env.close()
            break
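
`agent.policy.current_epsilon(global_timestep)` suggests a step-dependent epsilon-greedy schedule. A simplified, standalone sketch of such a policy (the class name, default values, and the `select_action(q_values, timestep)` signature are assumptions; the agent in this example wraps action selection differently):

import numpy as np

class AnnealingEpsilonGreedy:
    """Hypothetical epsilon-greedy policy with a linearly annealed exploration rate."""

    def __init__(self, epsilon_start=1.0, epsilon_end=0.02, decay_steps=100_000):
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.decay_steps = decay_steps

    def current_epsilon(self, timestep):
        # linear interpolation from epsilon_start down to epsilon_end
        fraction = min(timestep / self.decay_steps, 1.0)
        return self.epsilon_start + fraction * (self.epsilon_end - self.epsilon_start)

    def select_action(self, q_values, timestep):
        # explore with probability epsilon, otherwise act greedily on the Q-values
        if np.random.rand() < self.current_epsilon(timestep):
            return np.random.randint(len(q_values))
        return int(np.argmax(q_values))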
Example #6
def train(global_timestep, agent, env, replay_buffer, reward_buffer,
          summary_writer, num_eval_episodes, num_frames, eval_interval,
          hot_start, train_freq, batch_size, sync_freq, interval_MAR, log_dir,
          google_colab):
    time_buffer = list()
    log = logger(num_frames=num_frames, interval_MAR=interval_MAR)
    with summary_writer.as_default():
        tf.compat.v2.summary.text(name="Hyper-params",
                                  data=params_to_markdown(
                                      gin.operative_config_str()),
                                  step=0)
        for epoch in itertools.count():
            state = env.reset()
            total_reward = 0
            start = time.time()
            cnt_action = list()
            done = False
            while not done:
                action = agent.select_action(state)
                next_state, reward, done, info = env.step(action)
                replay_buffer.add(state, action, reward, next_state, done)

                global_timestep.assign_add(1)
                total_reward += reward
                state = next_state
                cnt_action.append(action)

                # for evaluation purpose
                if global_timestep.numpy() % eval_interval == 0:
                    agent.eval_flg = True

                if (global_timestep.numpy() > hot_start) and (
                        global_timestep.numpy() % train_freq == 0):
                    states, actions, rewards, next_states, dones = replay_buffer.sample(
                        batch_size)
                    agent.update(states, actions, rewards, next_states, dones)

                # hard-sync the target network with the main network (copy the weights)
                if (global_timestep.numpy() > hot_start) and (
                        global_timestep.numpy() % sync_freq == 0):
                    agent.manager.save()
                    agent.target_model.set_weights(
                        agent.main_model.get_weights())
            """
            ===== After 1 Episode is Done =====
            """
            ts = global_timestep.numpy()
            tf.compat.v2.summary.scalar("train/reward", total_reward, step=ts)
            tf.compat.v2.summary.scalar("train/exec_time",
                                        time.time() - start,
                                        step=ts)
            if ts > hot_start:
                tf.compat.v2.summary.scalar("train/MAR",
                                            np.mean(reward_buffer),
                                            step=ts)
            tf.compat.v2.summary.histogram("train/taken actions",
                                           cnt_action,
                                           step=ts)

            # store the episode reward
            reward_buffer.append(total_reward)
            time_buffer.append(time.time() - start)

            if ts > hot_start and epoch % interval_MAR == 0:
                log.logging(time_step=ts,
                            exec_time=np.sum(time_buffer),
                            reward_buffer=reward_buffer,
                            epsilon=agent.policy.current_epsilon())
                time_buffer = list()

            if agent.eval_flg:
                # replay_buffer.save()
                score = eval_Agent(agent,
                                   env,
                                   log_dir=log_dir,
                                   google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                agent.eval_flg = False

            # check the stopping condition
            if ts >= num_frames:
                print("=== Training is Done ===")
                score = eval_Agent(agent,
                                   env,
                                   n_trial=num_eval_episodes,
                                   log_dir=log_dir,
                                   google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                env.close()
                break
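
`eval_Agent` is referenced throughout these examples but not shown. A minimal sketch that plays a few episodes with the current policy and returns the mean score (the `log_dir`/`google_colab` arguments are accepted only to mirror the call sites above, and reusing `agent.select_action` is an assumption, since evaluation would typically use a greedy variant):

import numpy as np

def eval_Agent(agent, env, n_trial=1, log_dir=None, google_colab=False):
    """Hypothetical evaluation loop: play n_trial episodes and return the mean reward."""
    scores = []
    for _ in range(n_trial):
        state = env.reset()
        done, episode_reward = False, 0.0
        while not done:
            # assumption: reuse the training-time action selection
            action = agent.select_action(state)
            state, reward, done, _ = env.step(action)
            episode_reward += reward
        scores.append(episode_reward)
    return float(np.mean(scores))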