def __init__(self, policy_network, replay_memory, summary, config):
    self.config = config
    self.policy_network = policy_network
    self.replay_memory = replay_memory
    self.summary = summary

    # Create environment
    self.atari = Atari(summary, config)
    self.exploration_bonus = ExplorationBonus(config)
def create_config():
    config = flags.FLAGS

    # Convert e.g. 'SpaceInvaders' to 'space_invaders'
    config.game = '_'.join(
        [g.lower() for g in re.findall('[A-Z]?[a-z]+', config.game)])

    config.num_actions = Atari.num_actions(config)
    config.frameskip = eval(str(config.frameskip))
    config.input_shape = eval(str(config.input_shape))
    config.exploration_frame_shape = eval(str(config.exploration_frame_shape))
    config.reward_clipping = config.reward_clipping and not config.reward_scaling
    config.run_dir = util.run_directory(config)

    if not config.bootstrapped:
        config.num_bootstrap_heads = 1

    # Note: 'async' is a reserved keyword from Python 3.7 onward, so this
    # attribute would need renaming (e.g. 'async_') on modern interpreters
    if config.async is None:
        config.num_threads = 1
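# A minimal sketch of the flag definitions create_config() above assumes,
# using absl.flags (which tf.app.flags wraps in TF1). The flag names match
# the attributes read above; the defaults and help strings are illustrative
# assumptions, not taken from the original code.
from absl import flags

flags.DEFINE_string('game', 'SpaceInvaders', 'Atari game to play')
flags.DEFINE_string('frameskip', '4', 'Frames each action is repeated for')
flags.DEFINE_string('input_shape', '(84, 84)', 'Network input height/width')
flags.DEFINE_string('exploration_frame_shape', '(42, 42)',
                    'Frame shape used by the exploration bonus')
flags.DEFINE_float('reward_clipping', 1.0, 'Clip rewards to +/- this value')
flags.DEFINE_boolean('reward_scaling', False, 'Scale rewards instead of clipping')
flags.DEFINE_boolean('bootstrapped', False, 'Use bootstrapped DQN heads')
flags.DEFINE_integer('num_bootstrap_heads', 10, 'Number of bootstrap heads')
flags.DEFINE_integer('num_threads', 1, 'Worker threads when running asynchronously')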
def eval_by_frames(self):
    rewards = list()
    reward = 0.
    env = Atari(self.env_id)
    state = env.reset()
    with torch.cuda.stream(self.cuda_eval):
        # eval_frames is divided by 4, presumably to account for the frameskip,
        # so each loop iteration corresponds to one agent action
        for step in range(self.eval_frames // 4):
            action = self.agent.policy(
                np.expand_dims(state, 0),
                training=False,
                eps=self.eps_eval,
                return_streams=False,
            )[0]
            state, r, terminal, _ = env.step(action)
            reward += r
            if terminal:
                rewards.append(reward)
                reward = 0.
                state = env.reset()
    env.close()
    # Note: returns NaN if no episode finished within the frame budget
    return np.mean(rewards)
def eval_by_episodes(self):
    n_trials = self.eval_episodes
    envs = [Atari(self.env_id) for _ in range(n_trials)]
    states = np.stack([u.reset() for u in envs])
    # np.int / np.bool were removed in NumPy 1.24; the builtins behave identically here
    actions = np.empty(n_trials, dtype=int)
    reward = np.zeros(n_trials, dtype=np.float32)
    terminal = np.zeros(n_trials, dtype=bool)
    with torch.cuda.stream(self.cuda_eval):
        while not terminal.all():
            not_t = ~terminal
            actions[not_t] = self.agent.policy(
                states=states[not_t],
                training=False,
                eps=self.eps_eval,
                return_streams=False,
            )
            for i, nt in enumerate(not_t):
                if nt:
                    states[i], r, terminal[i], _ = envs[i].step(actions[i])
                    reward[i] += r
    for e in envs:
        e.close()
    return np.mean(reward)
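# Both evaluation helpers above call self.agent.policy(...) with a batch of
# states and an eps value. A minimal standalone sketch of such an
# epsilon-greedy batched policy, assuming a PyTorch Q-network that returns
# per-action values (all names and shapes here are illustrative assumptions):
import numpy as np
import torch

def epsilon_greedy_policy(q_network, states, eps, num_actions):
    # Greedy actions from the Q-network, computed without gradients
    with torch.no_grad():
        q_values = q_network(torch.as_tensor(states, dtype=torch.float32))
    greedy = q_values.argmax(dim=1).cpu().numpy()
    # With probability eps, replace the greedy action by a uniform random one
    explore = np.random.rand(len(greedy)) < eps
    random_actions = np.random.randint(num_actions, size=len(greedy))
    return np.where(explore, random_actions, greedy)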
class Agent(object):

    def __init__(self, policy_network, replay_memory, summary, config):
        self.config = config
        self.policy_network = policy_network
        self.replay_memory = replay_memory
        self.summary = summary

        # Create environment
        self.atari = Atari(summary, config)
        self.exploration_bonus = ExplorationBonus(config)

    def new_game(self):
        self.policy_network.sample_head()
        observation, reward, done = self.atari.reset()
        self.replay_memory.store_new_episode(observation)
        return observation, reward, done

    def action(self, session, step, observation):
        # Epsilon greedy exploration/exploitation even for bootstrapped DQN
        if np.random.rand() < self.epsilon(step):
            return self.atari.sample_action()
        else:
            [action] = session.run(
                self.policy_network.choose_action,
                {self.policy_network.inputs.observations: [observation]})
            return action

    def get_action_values(self, session, step, observation):
        return session.run(
            self.policy_network.eval_actions,
            {self.policy_network.inputs.observations: [observation]})

    def get_ram_state(self):
        return self.atari.env._get_ram()

    def get_full_frame(self):
        return self.atari.env._get_image()

    def epsilon(self, step):
        """Epsilon is linearly annealed from an initial exploration value
        to a final exploration value over a number of steps"""
        initial = self.config.initial_exploration
        final = self.config.final_exploration
        final_frame = self.config.final_exploration_frame

        annealing_rate = (initial - final) / final_frame
        annealed_exploration = initial - (step * annealing_rate)
        epsilon = max(annealed_exploration, final)

        self.summary.epsilon(step, epsilon)
        return epsilon

    def take_action(self, action):
        observation, reward, done = self.atari.step(action)
        training_reward = self.process_reward(reward, observation)

        # Store action, reward and done with the next observation
        self.replay_memory.store_transition(action, training_reward, done,
                                            observation)
        return observation, reward, done

    def process_reward(self, reward, frames):
        if self.config.exploration_bonus:
            reward += self.exploration_bonus.bonus(frames)

        if self.config.reward_clipping:
            reward = max(-self.config.reward_clipping,
                         min(reward, self.config.reward_clipping))
        return reward

    def populate_replay_memory(self):
        """Play game with random actions to populate the replay memory"""
        count = 0
        done = True

        while count < self.config.replay_start_size or not done:
            if done:
                self.new_game()
            _, _, done = self.take_action(self.atari.sample_action())
            count += 1

        self.atari.episode = 0

    def log_episode(self, step):
        self.atari.log_episode(step)
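# Worked example of the linear annealing schedule used by Agent.epsilon above.
# The hyperparameter values below are illustrative assumptions (the Mnih et al.
# 2015 DQN setup anneals epsilon from 1.0 to 0.1 over the first million frames):
def annealed_epsilon(step, initial=1.0, final=0.1, final_frame=1000000):
    annealing_rate = (initial - final) / final_frame  # 9e-7 per step here
    return max(initial - step * annealing_rate, final)

print(annealed_epsilon(0))        # 1.0   -- fully random at the start
print(annealed_epsilon(500000))   # ~0.55 -- halfway through the schedule
print(annealed_epsilon(2000000))  # 0.1   -- clamped at `final` afterwards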
def go(solver_filename, start_iter):
    check_for_test_vars()
    start_timestamp = int(time.time())
    log_file_name = get_episode_log_filename(start_timestamp)
    utils.setup_matplotlib()
    solver = utils.get_solver(solver_filename)
    net = solver.net
    frame_dir_name = get_frame_dir_name(start_timestamp)
    os.makedirs(frame_dir_name)
    episode_count = 0
    atari = Atari(frame_dir_name, episode_count, start_timestamp, show_game())
    action = actions.MOVE_RIGHT_AND_FIRE
    episode_stats = EpisodeStats()
    dqn = DqnSolver(atari, net, solver, start_timestamp, start_iter)

    while dqn.iter < int(1E7):  # 10 million training steps
        time1 = time.time()
        experience = atari.experience(EXPERIENCE_WINDOW_SIZE, action)
        time2 = time.time()
        print '%s function took %0.3f ms' % ('experience', (time2 - time1) * 1000.0)

        time1 = time.time()
        q, action = dqn.perceive(experience)
        time2 = time.time()
        print '%s function took %0.3f ms' % ('perceive', (time2 - time1) * 1000.0)

        time1 = time.time()
        exploit = dqn.should_exploit()
        time2 = time.time()
        print '%s function took %0.3f ms' % ('should-exploit', (time2 - time1) * 1000.0)

        if not exploit:
            action = actions.get_random_action()

        time1 = time.time()
        episode_stat = dqn.learn_from_experience_replay()
        time2 = time.time()
        print '%s function took %0.3f ms' % ('learn', (time2 - time1) * 1000.0)

        time1 = time.time()
        dqn.record_episode_stats(episode_stats, experience, q, action, exploit,
                                 episode_stat)
        time2 = time.time()
        print '%s function took %0.3f ms' % ('record', (time2 - time1) * 1000.0)

        if atari.game_over or 'TEST_AFTER_GAME' in os.environ:
            EpisodeStats.log_csv(episode_count, episode_stats, log_file_name)
            episode_count += 1
            episode_stats = EpisodeStats()
            atari.stop()
            if 'TEST_AFTER_GAME' in os.environ:
                return
            atari = Atari(frame_dir_name, episode_count, start_timestamp,
                          show_game())

        dqn.iter += 1
        print 'dqn iteration: ', dqn.iter
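# The go() loop above repeats the same time1/time2/print pattern for every
# phase. A small context-manager sketch (not part of the original code) would
# express the same timing more compactly; written Python-3 style here, while
# the snippet above is Python 2:
import contextlib
import time

@contextlib.contextmanager
def timed(name):
    start = time.time()
    yield
    print('%s function took %0.3f ms' % (name, (time.time() - start) * 1000.0))

# Usage inside the loop, e.g.:
#   with timed('perceive'):
#       q, action = dqn.perceive(experience)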
# (1,1,512).
# Learning rate for the Adam optimizer
LEARNING_RATE = 0.00001  # Set to 0.00025 in Pong for quicker results.
# Hessel et al. 2017 (Rainbow) used a learning rate of 0.0000625
TAU = 0.08  # The merging rate of the weight values between the primary and target networks
BS = 32  # Batch size for training

# For compatibility
PATH = "output/"  # Gifs and checkpoints will be saved here
SUMMARIES = "summaries"  # logdir for tensorboard
RUNID = 'run_1'
os.makedirs(PATH, exist_ok=True)
# os.makedirs(os.path.join(SUMMARIES, RUNID), exist_ok=True)
# SUMM_WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUNID))

atari = Atari(ENV_NAME, NO_OP_STEPS)
print("The environment has the following {} actions: {}".format(
    atari.env.action_space.n, atari.env.unwrapped.get_action_meanings()))

# input_shape = (BS, 84, 84, 4)
MAIN_DQN = MyModel(atari.env.action_space.n, learning_rate=LEARNING_RATE)
MAIN_DQN.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=tf.keras.losses.Huber())
# MAIN_DQN(np.zeros(input_shape))  # build
# MAIN_DQN.summary()  # and show summary

TARGET_DQN = MyModel(atari.env.action_space.n, learning_rate=LEARNING_RATE)
TARGET_DQN.compile(optimizer=tf.keras.optimizers.Adam(),
                   loss=tf.keras.losses.Huber())
# _ = TARGET_DQN(np.zeros(input_shape))  # build
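# TAU above is described as the merging rate between the primary and target
# networks, which suggests soft ("Polyak") target updates. A minimal sketch of
# such an update using tf.keras get_weights/set_weights; this is not taken from
# the original training loop, which is not shown here:
def soft_update_target(main_dqn, target_dqn, tau=TAU):
    main_w = main_dqn.get_weights()
    target_w = target_dqn.get_weights()
    # Blend each weight tensor: new_target = tau * main + (1 - tau) * target
    target_dqn.set_weights(
        [tau * m + (1.0 - tau) * t for m, t in zip(main_w, target_w)])

# e.g. soft_update_target(MAIN_DQN, TARGET_DQN) once per training step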
class Agent(object):

    def __init__(self, policy_network, replay_memory, summary, config):
        self.config = config
        self.policy_network = policy_network
        self.replay_memory = replay_memory
        self.summary = summary

        # Create environment
        self.atari = Atari(summary, config)
        self.exploration_bonus = ExplorationBonus(config)

    def new_game(self):
        self.policy_network.sample_head()
        observation, reward, done = self.atari.reset()
        self.replay_memory.store_new_episode(observation)
        return observation, reward, done

    def action(self, session, step, observation):
        # Epsilon greedy exploration/exploitation even for bootstrapped DQN
        if self.config.LLL:
            [e_vals, vals] = session.run(
                [self.policy_network.action_values,
                 self.policy_network.action_e_values],
                {self.policy_network.inputs.observations: [observation],
                 self.policy_network.inputs.alive: np.reshape([1], (1, 1))})
            return np.argmax(vals - self.epsilon(step) * np.log(-np.log(e_vals)))
        elif np.random.rand() < self.epsilon(step):
            return self.atari.sample_action()
        else:
            [action] = session.run(
                self.policy_network.choose_action,
                {self.policy_network.inputs.observations: [observation]})
            return action

    def epsilon(self, step):
        """Epsilon is linearly annealed from an initial exploration value
        to a final exploration value over a number of steps"""
        initial = self.config.initial_exploration
        final = self.config.final_exploration
        final_frame = self.config.final_exploration_frame

        annealing_rate = (initial - final) / final_frame
        annealed_exploration = initial - (step * annealing_rate)
        epsilon = max(annealed_exploration, final)

        self.summary.epsilon(step, epsilon)
        return epsilon

    def take_action(self, action, last_observation=None, session=None):
        observation, reward, done = self.atari.step(action)

        if self.config.e_exploration_bonus:
            if session is None:
                e_value = 0.5
            elif self.config.actor_critic:
                [e_value] = session.run(
                    self.policy_network.evalue,
                    {self.policy_network.inputs.observations: [observation],
                     self.policy_network.inputs.alive: np.reshape([1], (1, 1))})
                e_value = -e_value
            else:
                [e_value] = session.run(
                    self.policy_network.taken_action_e_value,
                    {self.policy_network.inputs.observations: [last_observation],
                     self.policy_network.inputs.action: np.reshape([action], (1, 1)),
                     self.policy_network.inputs.alive: np.reshape([1], (1, 1))})
        else:
            e_value = 0

        training_reward = self.process_reward(reward, observation, e_value)

        # Store action, reward and done with the next observation
        self.replay_memory.store_transition(action, training_reward, done,
                                            observation)
        return observation, reward, done

    def process_reward(self, reward, frames, e_value):
        if self.config.exploration_bonus:
            reward += self.exploration_bonus.bonus(frames)

        if self.config.e_exploration_bonus:
            counter = -np.log(e_value)
            exploration_bonus = self.config.exploration_beta / ((counter + 0.01) ** 0.5)
            reward += exploration_bonus

        if self.config.reward_clipping:
            reward = max(-self.config.reward_clipping,
                         min(reward, self.config.reward_clipping))
        return reward

    def populate_replay_memory(self):
        """Play game with random actions to populate the replay memory"""
        count = 0
        done = True

        while count < self.config.replay_start_size or not done:
            if done:
                self.new_game()
            _, _, done = self.take_action(self.atari.sample_action())
            count += 1

        self.atari.episode = 0

    def log_episode(self, step):
        self.atari.log_episode(step)
# (1,1,512).
# Learning rate for the Adam optimizer
LEARNING_RATE = 0.00001  # Set to 0.00025 in Pong for quicker results.
# Hessel et al. 2017 (Rainbow) used a learning rate of 0.0000625
TAU = 0.08  # The merging rate of the weight values between the primary and target networks
BS = 32  # Batch size for training

# For compatibility
PATH = "output/"  # Gifs and checkpoints will be saved here
SUMMARIES = "summaries"  # logdir for tensorboard
RUNID = 'run_1'
os.makedirs(PATH, exist_ok=True)
# os.makedirs(os.path.join(SUMMARIES, RUNID), exist_ok=True)
# SUMM_WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUNID))

atari = Atari(ENV_NAME, NO_OP_STEPS)
print("The environment has the following {} actions: {}".format(
    atari.env.action_space.n, atari.env.unwrapped.get_action_meanings()))

BASE_DIR = os.path.join('c:\\Python\\gymgames\\', 'DQNMODEL')

# Box is presumably imported from gym.spaces elsewhere in the file
if isinstance(atari.env.observation_space, Box):
    s_dim = (atari.env.observation_space.shape[0]
             if len(atari.env.observation_space.shape) == 1 else 0)
else:
    s_dim = int(atari.env.observation_space.n)

if len(atari.env.observation_space.shape) == 3:
    visual_sources = 1
    visual_resolution = list(atari.env.observation_space.shape)
# Overridden with the preprocessed frame-stack resolution
visual_resolution = [84, 84, 4]
from dqn import *
from atari import Atari

T = 10000
UPDATE_TIME = 100

if __name__ == '__main__':
    atari = Atari('breakout.bin')
    actions = atari.legal_actions
    dqn = DQN(actions)

    state = atari.newGame()
    # Stack the first frame four times to form the initial 84x84x4 state
    state = np.stack((state, state, state, state), axis=2).reshape((84, 84, 4))

    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())

    for _ in range(T):
        action = dqn.selectAction(state)
        next_state, reward, game_over = atari.next(action)
        # Assuming atari.next() returns a single 84x84x1 frame: append it to the
        # stack and drop the oldest frame so the state keeps the 4 newest frames
        next_state = np.append(state, next_state, axis=2)[:, :, 1:]
        dqn.storeExperience(state, action, reward, next_state, game_over)

        minibatch = dqn.sampleExperiences()
        state_batch = [experience[0] for experience in minibatch]
        nextState_batch = [experience[3] for experience in minibatch]
        action_batch = [experience[1] for experience in minibatch]
        terminal_batch = [experience[4] for experience in minibatch]
        reward_batch = [experience[2] for experience in minibatch]

        y_batch = []
        Q_batch = sess.run(
            dqn.targetQNet.QValue,
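# The snippet above is cut off mid-call. In a standard DQN update the target
# network is evaluated on nextState_batch and used to build Bellman targets.
# A generic sketch of that target computation (GAMMA and the function name are
# assumptions, not taken from this code):
import numpy as np

GAMMA = 0.99  # assumed discount factor

def build_targets(reward_batch, terminal_batch, target_q_batch):
    """target_q_batch: array of shape (batch, num_actions) from the target net."""
    y_batch = []
    for r, terminal, q_next in zip(reward_batch, terminal_batch, target_q_batch):
        if terminal:
            y_batch.append(r)  # no bootstrapping past the end of an episode
        else:
            y_batch.append(r + GAMMA * np.max(q_next))
    return y_batch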