Example #1
class Athlete(object):
    def __init__(self,
                 environment_name="CartPole-v1",
                 replay_memory_size=10000,
                 action_threshold=0.7,
                 batch_size=64,
                 gamma=0.9):
        self.environment = gym.make(environment_name)
        state = self.environment.reset()
        self.state_shape = state.shape
        self.action_space = self.environment.action_space.n
        self.replay_memory = ReplayMemory(self.state_shape,
                                          capacity=replay_memory_size)
        self.model = self.build_network()
        self.target_model = self.build_network()
        self.action_threshold = action_threshold
        self.batch_size = batch_size
        self.gamma = gamma

    def build_network(self) -> tf.keras.Model:
        # Subclasses must implement this and return the Q-network.
        raise NotImplementedError

    def choose_action(self, state: np.ndarray, threshold: float):
        if random.random() > threshold:
            # Explore: pick a random action.
            action = random.randint(0, self.action_space - 1)
        else:
            # Exploit: pick the action with the highest predicted Q-value.
            results = self.model.predict(state.reshape([1] +
                                                       list(state.shape)))
            action = np.argmax(results, 1)[0]
        return action

    def simulate(self, action_threshold: float):
        # Fill the replay memory by running the current policy in the environment.
        state = self.environment.reset()
        while not self.replay_memory.is_full:
            action = self.choose_action(state, action_threshold)
            state_after, reward, done, _ = self.environment.step(action)
            self.replay_memory.add(state, action, reward, done, state_after)
            state = state_after
            if done:
                state = self.environment.reset()

        return True

    def train(self, epoch=100, model_prefix="saved_models/model"):
        model_prefix = model_prefix + ".epoch_{}.score_{}.h5"
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                           loss=tf.keras.losses.mean_squared_error)
        for i in range(epoch):
            print("Epoch {} running ...".format(i))
            # Sync the target network with the online network before collecting data.
            self.target_model.set_weights(self.model.get_weights())
            self.replay_memory.reset()
            self.simulate(self.action_threshold)
            self.replay_memory.compute_estimated_q(self.target_model,
                                                   self.gamma)
            num_batches = self.replay_memory.length // self.batch_size
            for j in range(num_batches):
                states, actions, rewards, dones, next_states, estimated_q = self.replay_memory.random_batch(
                    self.batch_size)
                self.model.fit(states, estimated_q, epochs=1, verbose=0)

            if i % 5 == 0:
                score = self.estimate_model(self.model, render=False)
                model_path = model_prefix.format(i, score)
                print("Saving model: {} ...".format(model_path))
                self.model.save(model_path)

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        state = self.environment.reset()
        reward_count = 0
        while True:
            action = model.predict(
                state.reshape([1] + list(self.state_shape)))
            print(state)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            state_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            state = state_after

        print("Steps taken: ", reward_count)
        return reward_count

    def score_model(self, model=None, model_path="", num_iteration=10):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)

        scores = []
        for i in range(num_iteration):
            score = self.estimate_model(model)
            scores.append(score)
        avg_score = sum(scores) / num_iteration
        return avg_score
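
The build_network method above is deliberately left unimplemented, so the class cannot be used directly. A minimal sketch of a concrete subclass for CartPole is shown below; the class name CartPoleAthlete, the layer sizes, and the usage comments are illustrative assumptions, not part of the original example.

class CartPoleAthlete(Athlete):
    # Hypothetical subclass for illustration only; layer sizes are assumptions.
    def build_network(self) -> tf.keras.Model:
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation="relu",
                                  input_shape=self.state_shape),
            tf.keras.layers.Dense(64, activation="relu"),
            # One Q-value output per discrete action.
            tf.keras.layers.Dense(self.action_space),
        ])
        return model


# Typical usage, assuming gym, numpy, tensorflow and the ReplayMemory helper
# are importable in the same module:
#     athlete = CartPoleAthlete()
#     athlete.train(epoch=50)
#     athlete.score_model(model=athlete.model)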
Example #2
class MotionAthlete(Athlete):
    def __init__(self,
                 environment_name="Acrobot-v1",
                 replay_memory_size=10000,
                 action_threshold=0.7,
                 batch_size=64,
                 gamma=0.9):
        super(MotionAthlete,
              self).__init__(environment_name, replay_memory_size,
                             action_threshold, batch_size, gamma)
        self.environment.close()
        del self.environment
        self.environment = EnvironmentWrapper(environment_name)
        frame = self.environment.reset()
        frame_shape = frame.shape
        self.motion_tracer = MotionTracer(frame_shape=frame_shape)
        self.state_shape = self.motion_tracer.state_shape
        self.replay_memory = ReplayMemory(self.state_shape,
                                          capacity=replay_memory_size)
        del self.model
        del self.target_model
        self.model = self.build_network()
        self.target_model = self.build_network()

    def simulate(self, action_threshold: float):
        print("Simulating...")
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        while not self.replay_memory.is_full:
            state = self.motion_tracer.get_state()
            action = self.choose_action(state, action_threshold)
            frame_after, reward, done, _ = self.environment.step(action)
            self.motion_tracer.add_frame(frame_after)
            state_next = self.motion_tracer.get_state()
            self.replay_memory.add(state, action, reward, done, state_next)
            if done:
                frame = self.environment.reset()
                self.motion_tracer.reset()
                self.motion_tracer.add_frame(frame)
        print("Simulation finished")

        return True

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        state = self.motion_tracer.get_state()
        reward_count = 0
        step_count = 0
        while True:
            step_count += 1
            action = model.predict(
                state.reshape([1] + list(self.state_shape)))
            print(frame)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            frame_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            self.motion_tracer.add_frame(frame_after)
            state = self.motion_tracer.get_state()

        print("Total reward: ", reward_count)
        print("Total step: ", step_count)
        return reward_count
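
As with Athlete, MotionAthlete leaves build_network to be supplied elsewhere. Below is a rough sketch of a convolutional Q-network for the stacked-frame states produced by MotionTracer. It assumes the tracer emits an image-like state with a channel axis; the fallback branch exists only because the base __init__ builds (and then discards) a model before the tracer's state shape is known. The class name and all layer sizes are assumptions.

class ConvMotionAthlete(MotionAthlete):
    # Hypothetical subclass for illustration only; filter counts, kernel
    # sizes, and the dense width are assumptions, not the original code.
    def build_network(self) -> tf.keras.Model:
        layers = []
        if len(self.state_shape) == 3:
            # Image-like (height, width, channels) state from MotionTracer.
            layers += [
                tf.keras.layers.Conv2D(32, 8, strides=4, activation="relu",
                                       input_shape=self.state_shape),
                tf.keras.layers.Conv2D(64, 4, strides=2, activation="relu"),
                tf.keras.layers.Flatten(),
            ]
        else:
            # Fallback for the throw-away model that the base __init__
            # builds before self.state_shape is replaced by the tracer's.
            layers += [tf.keras.layers.Flatten(input_shape=self.state_shape)]
        layers += [
            tf.keras.layers.Dense(256, activation="relu"),
            # One Q-value output per discrete action.
            tf.keras.layers.Dense(self.action_space),
        ]
        return tf.keras.Sequential(layers)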
Example #3
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    ----------
    env: gym.Env
        the OpenAI Gym environment to train on
    conv_layers: list
        a list of triples that defines the convolutional network
    learning_rate: float
        learning rate for the Adam optimizer
    total_timesteps: int
        total number of environment steps to train for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of the entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of the random-action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress;
        set to None to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        the filesystem path for storing checkpoints
    learning_starts: int
        how many steps to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    double_dqn: bool
        whether to use double Q-learning during training

    Returns
    -------
    q_network: the trained online Q-network as a tf.keras.Model
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)

    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(total_timesteps=int(exploration_fraction *
                                                     total_timesteps),
                                 initial_prob=1.0,
                                 final_prob=exploration_final_eps)

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()

    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)

        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()

        new_obs, reward, done, _ = env.step(action)
        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            weights = tf.ones_like(rewards)

            td_loss = dqn.train(obses_t, actions, rewards, obses_tp1, dones,
                                weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network every target_network_update_freq steps
            dqn.update_target()

        reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
        number_episodes = len(episode_rewards) - 1
        if done and print_freq is not None and number_episodes % print_freq == 0:
            format_str = "Steps: {}, Episodes: {}, 100 ep reward average: {}, Reward: {}, Epsilon-greedy %explore: {}"
            print(
                format_str.format(t, number_episodes, reward_100_mean,
                                  episode_rewards[-2],
                                  int(100 * exploration.value(t))))

            with train_summary_writer.as_default():
                tf.summary.scalar('loss',
                                  dqn.train_loss_metrics.result(),
                                  step=t)
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Every training step, reset the loss metric
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
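
A usage sketch for train_model follows. It assumes conv_layers uses the common Atari-style (filters, kernel_size, stride) convention, that the environment id and hyperparameters shown are reasonable choices, and that frame preprocessing wrappers (if any) are applied before the call; none of this is part of the original example.

if __name__ == "__main__":
    # Hypothetical entry point; the environment id, hyperparameters, and the
    # (filters, kernel_size, stride) reading of conv_layers are assumptions.
    env = gym.make("PongNoFrameskip-v4")
    q_network = train_model(env,
                            conv_layers=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                            learning_rate=1e-4,
                            total_timesteps=1_000_000,
                            double_dqn=True,
                            checkpoint_path="checkpoints/pong")
    # Persist only the weights, which works for any Keras model type.
    q_network.save_weights("saved_models/pong_q_network")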