Example #1
File: agent.py  Project: andyrdt/atari-rl
  def __init__(self, policy_network, replay_memory, summary, config):
    self.config = config
    self.policy_network = policy_network
    self.replay_memory = replay_memory
    self.summary = summary

    # Create environment
    self.atari = Atari(summary, config)
    self.exploration_bonus = ExplorationBonus(config)
Example #2
def create_config():
    config = flags.FLAGS
    config.game = '_'.join(
        [g.lower() for g in re.findall('[A-Z]?[a-z]+', config.game)])
    config.num_actions = Atari.num_actions(config)
    config.frameskip = eval(str(config.frameskip))
    config.input_shape = eval(str(config.input_shape))
    config.exploration_frame_shape = eval(str(config.exploration_frame_shape))
    config.reward_clipping = config.reward_clipping and not config.reward_scaling
    config.run_dir = util.run_directory(config)

    if not config.bootstrapped: config.num_bootstrap_heads = 1

    if config.async is None:
        config.num_threads = 1
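
A side note on the eval(str(...)) calls above: they turn flag values such as "(84, 84, 4)" back into Python objects, but eval will execute arbitrary expressions. A minimal sketch of the same conversion using ast.literal_eval, which only accepts literals (the helper name parse_flag is an assumption, not part of the project):

import ast

def parse_flag(value):
    # Convert a flag that may arrive as a string like "(84, 84, 4)" or "4"
    # into the corresponding Python literal without executing code.
    return ast.literal_eval(str(value))

# e.g. config.input_shape = parse_flag(config.input_shape)
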
Example #3
 def eval_by_frames(self):
     rewards = list()
     reward = 0.
     env = Atari(self.env_id)
     state = env.reset()
     with torch.cuda.stream(self.cuda_eval):
         for step in range(self.eval_frames // 4):
             action = self.agent.policy(
                 np.expand_dims(state, 0),
                 training=False,
                 eps=self.eps_eval,
                 return_streams=False,
             )[0]
             state, r, terminal, _ = env.step(action)
             reward += r
             if terminal:
                 rewards.append(reward)
                 reward = 0.
                 state = env.reset()
     env.close()
     return np.mean(rewards)
Example #4
 def eval_by_episodes(self):
     n_trials = self.eval_episodes
     envs = [Atari(self.env_id) for _ in range(n_trials)]
     states = np.stack([u.reset() for u in envs])
     actions = np.empty(n_trials, dtype=np.int64)  # np.int alias removed in NumPy 1.24
     reward = np.zeros(n_trials, dtype=np.float32)
     terminal = np.zeros(n_trials, dtype=bool)  # np.bool alias removed as well
     with torch.cuda.stream(self.cuda_eval):
         while not terminal.all():
             not_t = ~terminal
             actions[not_t] = self.agent.policy(
                 states=states[not_t],
                 training=False,
                 eps=self.eps_eval,
                 return_streams=False,
             )
             for i, nt in enumerate(not_t):
                 if nt:
                     states[i], r, terminal[i], _ = envs[i].step(actions[i])
                     reward[i] += r
     for e in envs:
         e.close()
     return np.mean(reward)
Example #5
File: agent.py  Project: andyrdt/atari-rl
class Agent(object):
  def __init__(self, policy_network, replay_memory, summary, config):
    self.config = config
    self.policy_network = policy_network
    self.replay_memory = replay_memory
    self.summary = summary

    # Create environment
    self.atari = Atari(summary, config)
    self.exploration_bonus = ExplorationBonus(config)

  def new_game(self):
    self.policy_network.sample_head()
    observation, reward, done = self.atari.reset()
    self.replay_memory.store_new_episode(observation)
    return observation, reward, done

  def action(self, session, step, observation):
    # Epsilon greedy exploration/exploitation even for bootstrapped DQN
    if np.random.rand() < self.epsilon(step):
      return self.atari.sample_action()
    else:
      [action] = session.run(
          self.policy_network.choose_action,
          {self.policy_network.inputs.observations: [observation]})
      return action

  def get_action_values(self, session, step, observation):
    return session.run(
        self.policy_network.eval_actions,
        {self.policy_network.inputs.observations: [observation]})

  def get_ram_state(self):
    return self.atari.env._get_ram()

  def get_full_frame(self):
    return self.atari.env._get_image()

  def epsilon(self, step):
    """Epsilon is linearly annealed from an initial exploration value
    to a final exploration value over a number of steps"""

    initial = self.config.initial_exploration
    final = self.config.final_exploration
    final_frame = self.config.final_exploration_frame

    annealing_rate = (initial - final) / final_frame
    annealed_exploration = initial - (step * annealing_rate)
    epsilon = max(annealed_exploration, final)

    self.summary.epsilon(step, epsilon)

    return epsilon

  def take_action(self, action):
    observation, reward, done = self.atari.step(action)
    training_reward = self.process_reward(reward, observation)

    # Store action, reward and done with the next observation
    self.replay_memory.store_transition(action, training_reward, done,
                                        observation)

    return observation, reward, done

  def process_reward(self, reward, frames):
    if self.config.exploration_bonus:
      reward += self.exploration_bonus.bonus(frames)

    if self.config.reward_clipping:
      reward = max(-self.config.reward_clipping,
                   min(reward, self.config.reward_clipping))

    return reward

  def populate_replay_memory(self):
    """Play game with random actions to populate the replay memory"""

    count = 0
    done = True

    while count < self.config.replay_start_size or not done:
      if done: self.new_game()
      _, _, done = self.take_action(self.atari.sample_action())
      count += 1

    self.atari.episode = 0

  def log_episode(self, step):
    self.atari.log_episode(step)
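
The epsilon method in the example above implements a linear annealing schedule: the exploration rate falls from initial_exploration to final_exploration over final_exploration_frame steps and is then held at the final value. A minimal standalone sketch of that schedule (the default values 1.0, 0.1 and 1000000 below are illustrative assumptions, not this project's configuration):

def linear_epsilon(step, initial=1.0, final=0.1, final_frame=1000000):
    # Anneal linearly from `initial` to `final` over `final_frame` steps,
    # then hold at `final`.
    annealing_rate = (initial - final) / final_frame
    return max(initial - step * annealing_rate, final)

# e.g. linear_epsilon(0) -> 1.0, linear_epsilon(500000) -> 0.55, linear_epsilon(2000000) -> 0.1
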
Example #6
File: dqn.py  Project: crizCraig/dqn-1
def go(solver_filename, start_iter):
    check_for_test_vars()
    start_timestamp = int(time.time())
    log_file_name = get_episode_log_filename(start_timestamp)
    utils.setup_matplotlib()
    solver = utils.get_solver(solver_filename)
    net = solver.net
    frame_dir_name = get_frame_dir_name(start_timestamp)
    os.makedirs(frame_dir_name)
    episode_count = 0
    atari = Atari(frame_dir_name, episode_count, start_timestamp, show_game())
    action = actions.MOVE_RIGHT_AND_FIRE
    episode_stats = EpisodeStats()
    dqn = DqnSolver(atari, net, solver, start_timestamp, start_iter)
    while dqn.iter < int(1E7):  # 10 million training steps

        time1 = time.time()
        experience = atari.experience(EXPERIENCE_WINDOW_SIZE, action)
        time2 = time.time()
        print '%s function took %0.3f ms' % \
              ('experience', (time2 - time1) * 1000.0)

        time1 = time.time()
        q, action = dqn.perceive(experience)
        time2 = time.time()
        print '%s function took %0.3f ms' %\
              ('perceive', (time2 - time1) * 1000.0)

        time1 = time.time()
        exploit = dqn.should_exploit()
        time2 = time.time()
        print '%s function took %0.3f ms' %\
              ('should-exploit', (time2 - time1) * 1000.0)

        if not exploit:
            action = actions.get_random_action()

        time1 = time.time()
        episode_stat = dqn.learn_from_experience_replay()
        time2 = time.time()
        print '%s function took %0.3f ms' %\
              ('learn', (time2 - time1) * 1000.0)

        time1 = time.time()
        dqn.record_episode_stats(episode_stats, experience, q, action, exploit,
                                 episode_stat)
        time2 = time.time()
        print '%s function took %0.3f ms' %\
              ('record', (time2 - time1) * 1000.0)

        if atari.game_over or 'TEST_AFTER_GAME' in os.environ:
            EpisodeStats.log_csv(episode_count, episode_stats, log_file_name)
            episode_count += 1
            episode_stats = EpisodeStats()
            atari.stop()
            if 'TEST_AFTER_GAME' in os.environ:
                return
            atari = Atari(frame_dir_name, episode_count, start_timestamp,
                          show_game())
        dqn.iter += 1
        print 'dqn iteration: ', dqn.iter
Example #7
# (1,1,512).
# Learning rate for the Adam optimizer
LEARNING_RATE = 0.00001  # Set to 0.00025 in Pong for quicker results.
TAU = 0.08  # The merging rate of the weight values between the primary and target networks
# Hessel et al. 2017 used 0.0000625
# Batch size for training
BS = 32  # Batch size
# For compatibility
PATH = "output/"  # Gifs and checkpoints will be saved here
SUMMARIES = "summaries"  # logdir for tensorboard
RUNID = 'run_1'
os.makedirs(PATH, exist_ok=True)
# os.makedirs(os.path.join(SUMMARIES, RUNID), exist_ok=True)
# SUMM_WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUNID))

atari = Atari(ENV_NAME, NO_OP_STEPS)

print("The environment has the following {} actions: {}".format(
    atari.env.action_space.n, atari.env.unwrapped.get_action_meanings()))
# input_shape = (BS, 84, 84, 4)
MAIN_DQN = MyModel(atari.env.action_space.n, learning_rate=LEARNING_RATE)
MAIN_DQN.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=tf.keras.losses.Huber())
# MAIN_DQN(np.zeros(input_shape))             # build
# MAIN_DQN.summary()                              # and show summary

TARGET_DQN = MyModel(atari.env.action_space.n, learning_rate=LEARNING_RATE)
TARGET_DQN.compile(optimizer=tf.keras.optimizers.Adam(),
                   loss=tf.keras.losses.Huber())
#_ = TARGET_DQN(np.zeros(input_shape))             # build
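
The TAU constant above is described as the merging rate between the primary and target networks, but the snippet does not show the update itself. A minimal sketch of the usual Polyak-style soft update, assuming the MAIN_DQN, TARGET_DQN and TAU defined in the snippet above (the helper name soft_update is an assumption, not part of the project):

def soft_update(main_model, target_model, tau=TAU):
    # Blend every target weight towards the matching main-network weight:
    # target <- tau * main + (1 - tau) * target
    blended = [tau * w_main + (1.0 - tau) * w_target
               for w_main, w_target in zip(main_model.get_weights(),
                                           target_model.get_weights())]
    target_model.set_weights(blended)

# e.g. call soft_update(MAIN_DQN, TARGET_DQN) after each training step,
# or copy the weights outright every N steps (tau=1.0) for a hard update.
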
Example #8
class Agent(object):
  def __init__(self, policy_network, replay_memory, summary, config):
    self.config = config
    self.policy_network = policy_network
    self.replay_memory = replay_memory
    self.summary = summary

    # Create environment
    self.atari = Atari(summary, config)
    self.exploration_bonus = ExplorationBonus(config)

  def new_game(self):
    self.policy_network.sample_head()
    observation, reward, done = self.atari.reset()
    self.replay_memory.store_new_episode(observation)
    return observation, reward, done

  def action(self, session, step, observation):
    # Epsilon greedy exploration/exploitation even for bootstrapped DQN
    if self.config.LLL:
      [e_vals, vals] = session.run(
          [self.policy_network.action_values, self.policy_network.action_e_values],
          {self.policy_network.inputs.observations: [observation],
           self.policy_network.inputs.alive: np.reshape([1],(1,1))})
      return np.argmax(vals - self.epsilon(step) * np.log(-np.log(e_vals)))

    elif np.random.rand() < self.epsilon(step):
      return self.atari.sample_action()
    else:
      [action] = session.run(
          self.policy_network.choose_action,
          {self.policy_network.inputs.observations: [observation]})
      return action

  def epsilon(self, step):
    """Epsilon is linearly annealed from an initial exploration value
    to a final exploration value over a number of steps"""

    initial = self.config.initial_exploration
    final = self.config.final_exploration
    final_frame = self.config.final_exploration_frame

    annealing_rate = (initial - final) / final_frame
    annealed_exploration = initial - (step * annealing_rate)
    epsilon = max(annealed_exploration, final)

    self.summary.epsilon(step, epsilon)

    return epsilon

  def take_action(self, action, last_observation=None, session=None):
    observation, reward, done = self.atari.step(action)

    if self.config.e_exploration_bonus:
      if session is None:
        e_value = 0.5

      elif self.config.actor_critic:
        [e_value] = session.run(
            self.policy_network.evalue,
            {self.policy_network.inputs.observations: [observation],
             self.policy_network.inputs.alive: np.reshape([1],(1,1))})
        e_value = e_value*-1

      else:
        [e_value] = session.run(
            self.policy_network.taken_action_e_value,
            {self.policy_network.inputs.observations: [last_observation],
             self.policy_network.inputs.action: np.reshape([action],(1,1)),
             self.policy_network.inputs.alive: np.reshape([1],(1,1))})
    else:
      e_value = 0

    training_reward = self.process_reward(reward, observation, e_value)

    # Store action, reward and done with the next observation
    self.replay_memory.store_transition(action, training_reward, done,
                                        observation)

    return observation, reward, done

  def process_reward(self, reward, frames, e_value):
    if self.config.exploration_bonus:
      reward += self.exploration_bonus.bonus(frames)

    if self.config.e_exploration_bonus:
      counter = -np.log(e_value)
      exploration_bonus = self.config.exploration_beta / ((counter + 0.01) ** 0.5)
      reward += exploration_bonus

    if self.config.reward_clipping:
      reward = max(-self.config.reward_clipping,
                   min(reward, self.config.reward_clipping))

    return reward

  def populate_replay_memory(self):
    """Play game with random actions to populate the replay memory"""

    count = 0
    done = True

    while count < self.config.replay_start_size or not done:
      if done: self.new_game()
      _, _, done = self.take_action(self.atari.sample_action())
      count += 1

    self.atari.episode = 0

  def log_episode(self, step):
    self.atari.log_episode(step)
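
In process_reward above, the e-value is turned into an exploration bonus via counter = -log(e_value) and bonus = exploration_beta / sqrt(counter + 0.01). A small standalone sketch of that arithmetic (the beta value 0.05 is an illustrative assumption, not this project's setting):

import numpy as np

def e_value_bonus(e_value, beta=0.05):
    # A larger e_value gives a smaller counter and therefore a larger bonus.
    counter = -np.log(e_value)
    return beta / np.sqrt(counter + 0.01)

# e.g. e_value_bonus(0.99) ~ 0.35, e_value_bonus(0.01) ~ 0.023
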
Example #9
# (1,1,512).
# Learning rate for the Adam optimizer
LEARNING_RATE = 0.00001          # Set to 0.00025 in Pong for quicker results.
TAU = 0.08                       # The merging rate of the weight values between the primary and target networks
# Hessel et al. 2017 used 0.0000625
# Batch size for training
BS = 32                          # Batch size
# For compatibility
PATH = "output/"                 # Gifs and checkpoints will be saved here
SUMMARIES = "summaries"          # logdir for tensorboard
RUNID = 'run_1'
os.makedirs(PATH, exist_ok=True)
# os.makedirs(os.path.join(SUMMARIES, RUNID), exist_ok=True)
# SUMM_WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUNID))

atari = Atari(ENV_NAME, NO_OP_STEPS)

print("The environment has the following {} actions: {}".format(atari.env.action_space.n,
                                                                atari.env.unwrapped.get_action_meanings()))

BASE_DIR = os.path.join('c:\\Python\\gymgames\\', 'DQNMODEL')

if isinstance(atari.env.observation_space, Box):
    s_dim = atari.env.observation_space.shape[0] if len(atari.env.observation_space.shape) == 1 else 0
else:
    s_dim = int(atari.env.observation_space.n)

if len(atari.env.observation_space.shape) == 3:
    visual_sources = 1
    visual_resolution = list(atari.env.observation_space.shape)
    visual_resolution = [84, 84, 4]
Example #10
from dqn import *
from atari import Atari

T = 10000
UPDATE_TIME = 100

if __name__ == '__main__':
    atari = Atari('breakout.bin')
    actions = atari.legal_actions
    dqn = DQN(actions)
    state = atari.newGame()
    state = np.stack((state, state, state, state), axis=2).reshape((84, 84, 4))
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    for _ in range(T):
        action = dqn.selectAction(state)

        next_state, reward, game_over = atari.next(action)
        next_state = np.append(next_state, state, axis=2)[:, :, 1:]
        dqn.storeExperience(state, action, reward, next_state, game_over)

        minibatch = dqn.sampleExperiences()
        state_batch = [experience[0] for experience in minibatch]
        nextState_batch = [experience[3] for experience in minibatch]
        action_batch = [experience[1] for experience in minibatch]
        terminal_batch = [experience[4] for experience in minibatch]
        reward_batch = [experience[2] for experience in minibatch]

        y_batch = []
        Q_batch = sess.run(
            dqn.targetQNet.QValue,