def measure(name, iters=5000, **settings):
    print(name)
    for k, v in settings.items():
        print("\t{}: {}".format(k, v))

    # Vizdoom wrapper
    doom_wrapper = VizdoomWrapper(**settings)
    start = time()
    for _ in trange(iters, leave=False):
        current_img, current_misc = doom_wrapper.get_current_state()
        action_index = randint(0, doom_wrapper.actions_num - 1)
        doom_wrapper.make_action(action_index)
        if doom_wrapper.is_terminal():
            doom_wrapper.reset()
    end = time()
    wrapper_t = end - start

    # Vanilla vizdoom:
    doom = vzd.DoomGame()
    if "scenarios_path" not in settings:
        scenarios_path = vzd.__path__[0] + "/scenarios"
    else:
        scenarios_path = settings["scenarios_path"]
    config_file = scenarios_path + "/" + settings["config_file"]
    doom.load_config(config_file)
    doom.set_window_visible(False)
    doom.set_screen_format(vzd.ScreenFormat.GRAY8)
    doom.set_screen_resolution(vzd.ScreenResolution.RES_160X120)
    doom.init()
    # Enumerate every binary combination of the available buttons as an action.
    actions = [list(a) for a in it.product([0, 1], repeat=doom.get_available_buttons_size())]

    start = time()
    frame_skip = settings["frame_skip"]
    for _ in trange(iters, leave=False):
        if doom.is_episode_finished():
            doom.new_episode()
        doom.make_action(choice(actions), frame_skip)
    end = time()
    vanilla_t = end - start

    print(green("\twrapper: {:0.2f} steps/s".format(iters / wrapper_t)))
    print(green("\twrapper: {:0.2f} s/1000 steps".format(wrapper_t / iters * 1000)))
    print(blue("\tvanilla: {:0.2f} steps/s".format(iters / vanilla_t)))
    print(blue("\tvanilla: {:0.2f} s/1000 steps\n".format(vanilla_t / iters * 1000)))
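
# Example (hypothetical) invocation of the benchmark above. The setting keys mirror
# how measure() reads them (config_file, frame_skip, optionally scenarios_path);
# the concrete values are assumptions for illustration, not defaults from this repository.
if __name__ == "__main__":
    measure("vizdoom wrapper vs vanilla vizdoom",
            iters=5000,
            config_file="basic.cfg",
            frame_skip=4)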
def _print_train_log(self, scores, overall_start_time, last_log_time, steps):
    current_time = time.time()
    mean_score = np.mean(scores)
    score_std = np.std(scores)
    min_score = np.min(scores)
    max_score = np.max(scores)
    elapsed_time = time.time() - overall_start_time
    global_steps = self._global_steps_counter.get()
    local_steps_per_sec = steps / (current_time - last_log_time)
    global_steps_per_sec = global_steps / elapsed_time
    global_mil_steps_per_hour = global_steps_per_sec * 3600 / 1000000.0
    log(
        "TRAIN: {}(GlobalSteps), {} episodes, mean: {}, min: {}, max: {}, "
        "\nLocalSpd: {:.0f} STEPS/s GlobalSpd: "
        "{} STEPS/s, {:.2f}M STEPS/hour, total elapsed time: {}".format(
            global_steps,
            len(scores),
            green("{:0.3f}±{:0.2f}".format(mean_score, score_std)),
            red("{:0.3f}".format(min_score)),
            blue("{:0.3f}".format(max_score)),
            local_steps_per_sec,
            blue("{:.0f}".format(global_steps_per_sec)),
            global_mil_steps_per_hour,
            sec_to_str(elapsed_time)))
def test(self, episodes_num=None, deterministic=True):
    if episodes_num is None:
        episodes_num = self.test_episodes_per_epoch

    test_start_time = time.time()
    test_rewards = []
    test_actions = []
    test_frameskips = []
    for _ in trange(episodes_num, desc="Testing", file=sys.stdout,
                    leave=False, disable=not self.enable_progress_bar):
        total_reward, actions, frameskips, _ = self.run_episode(deterministic=deterministic,
                                                                return_stats=True)
        test_rewards.append(total_reward)
        test_actions += actions
        test_frameskips += frameskips

        self.doom_wrapper.reset()
        if self.local_network.has_state():
            self.local_network.reset_state()

    test_end_time = time.time()
    test_duration = test_end_time - test_start_time
    min_score = np.min(test_rewards)
    max_score = np.max(test_rewards)
    mean_score = np.mean(test_rewards)
    score_std = np.std(test_rewards)
    log(
        "TEST: mean: {}, min: {}, max: {}, test time: {}".format(
            green("{:0.3f}±{:0.2f}".format(mean_score, score_std)),
            red("{:0.3f}".format(min_score)),
            blue("{:0.3f}".format(max_score)),
            sec_to_str(test_duration)))

    return test_rewards, test_actions, test_frameskips
def print_epoch_log(prefix, scores, steps, epoch_time):
    mean_score = np.mean(scores)
    score_std = np.std(scores)
    min_score = np.min(scores)
    max_score = np.max(scores)
    episodes = len(scores)

    steps_per_sec = steps / epoch_time
    mil_steps_per_hour = steps_per_sec * 3600 / 1000000.0
    log("{}: Episodes: {}, mean: {}, min: {}, max: {}, "
        " Speed: {:.0f} STEPS/s, {:.2f}M STEPS/hour, time: {}".format(
            prefix,
            episodes,
            green("{:0.3f}±{:0.2f}".format(mean_score, score_std)),
            red("{:0.3f}".format(min_score)),
            blue("{:0.3f}".format(max_score)),
            steps_per_sec,
            mil_steps_per_hour,
            sec_to_str(epoch_time)))
def train_async(q_learning, settings):
    proto_vizdoom = VizdoomWrapper(noinit=True, **settings)
    actions_num = proto_vizdoom.actions_num
    misc_len = proto_vizdoom.misc_len
    img_shape = proto_vizdoom.img_shape
    del proto_vizdoom

    # TODO: target global network
    # This global step counts gradient applications, not performed actions.
    global_train_step = tf.Variable(0, trainable=False, name="global_step")
    global_learning_rate = tf.train.polynomial_decay(
        name="learning_rate",
        learning_rate=settings["initial_learning_rate"],
        end_learning_rate=settings["final_learning_rate"],
        decay_steps=settings["learning_rate_decay_steps"],
        global_step=global_train_step)
    optimizer = ClippingRMSPropOptimizer(learning_rate=global_learning_rate, **settings["rmsprop"])

    learners = []
    network_class = eval(settings["network_type"])

    global_network = network_class(actions_num=actions_num, misc_len=misc_len,
                                   img_shape=img_shape, **settings)
    global_steps_counter = ThreadsafeCounter()

    if q_learning:
        global_target_network = network_class(thread="global_target", actions_num=actions_num,
                                              misc_len=misc_len, img_shape=img_shape, **settings)
        global_network.prepare_unfreeze_op(global_target_network)
        unfreeze_thread = min(1, settings["threads_num"] - 1)
        for i in range(settings["threads_num"]):
            learner = ADQNLearner(thread_index=i,
                                  global_network=global_network,
                                  unfreeze_thread=i == unfreeze_thread,
                                  global_target_network=global_target_network,
                                  optimizer=optimizer,
                                  learning_rate=global_learning_rate,
                                  global_steps_counter=global_steps_counter,
                                  **settings)
            learners.append(learner)
    else:
        for i in range(settings["threads_num"]):
            learner = A3CLearner(thread_index=i,
                                 global_network=global_network,
                                 optimizer=optimizer,
                                 learning_rate=global_learning_rate,
                                 global_steps_counter=global_steps_counter,
                                 **settings)
            learners.append(learner)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)

    log("Initializing variables...")
    session.run(tf.global_variables_initializer())
    log("Initialization finished.\n")

    if q_learning:
        session.run(global_network.ops.unfreeze)

    log(green("Starting training.\n"))

    for l in learners:
        l.run_training(session)
    for l in learners:
        l.join()
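
# train_async() shares one ThreadsafeCounter between all learner threads, and
# _print_train_log() reads it via .get(). A minimal sketch of such a counter is given
# below, assuming only an increment/get interface; the method names other than get()
# are hypothetical and the actual implementation in this repository may differ.
import threading


class ThreadsafeCounter(object):
    def __init__(self):
        self._value = 0
        self._lock = threading.Lock()

    def inc(self, amount=1):
        # Atomically add `amount` and return the new value.
        with self._lock:
            self._value += amount
            return self._value

    def get(self):
        # Read the current value under the lock.
        with self._lock:
            return self._value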
def train(self, session):
    # Prefill replay memory with random transitions:
    for _ in trange(self.replay_memory.capacity, desc="Filling replay memory", leave=False,
                    disable=not self.enable_progress_bar, file=sys.stdout):
        if self.doom_wrapper.is_terminal():
            self.doom_wrapper.reset()
        s1 = self.doom_wrapper.get_current_state()
        action_frameskip_index = randint(0, self.actions_num * len(self.frameskips) - 1)
        action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
        reward = self.doom_wrapper.make_action(action_index, frameskip)
        terminal = self.doom_wrapper.is_terminal()
        s2 = self.doom_wrapper.get_current_state()
        self.replay_memory.add_transition(s1, action_frameskip_index, s2, reward, terminal)

    overall_start_time = time()
    self.network.update_target_network(session)
    log(green("Starting training.\n"))

    while self._epoch <= self._epochs:
        self.doom_wrapper.reset()
        train_scores = []
        test_scores = []
        train_start_time = time()

        for _ in trange(self.train_steps_per_epoch, desc="Training, epoch {}".format(self._epoch),
                        leave=False, disable=not self.enable_progress_bar, file=sys.stdout):
            self.steps += 1
            s1 = self.doom_wrapper.get_current_state()

            # Epsilon-greedy selection of the joint (action, frameskip) index.
            if random() <= self.get_current_epsilon():
                action_frameskip_index = randint(0, self.actions_num * len(self.frameskips) - 1)
            else:
                action_frameskip_index = self.network.get_action(session, s1)
            action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)

            reward = self.doom_wrapper.make_action(action_index, frameskip)
            terminal = self.doom_wrapper.is_terminal()
            s2 = self.doom_wrapper.get_current_state()
            self.replay_memory.add_transition(s1, action_frameskip_index, s2, reward, terminal)

            if self.steps % self.update_pattern[0] == 0:
                for _ in range(self.update_pattern[1]):
                    self.network.train_batch(session, self.replay_memory.get_sample())

            if terminal:
                train_scores.append(self.doom_wrapper.get_total_reward())
                self.doom_wrapper.reset()

            if self.steps % self.frozen_steps == 0:
                self.network.update_target_network(session)

        train_time = time() - train_start_time

        log("Epoch {}".format(self._epoch))
        log("Training steps: {}, epsilon: {}".format(self.steps, self.get_current_epsilon()))
        self.print_epoch_log("TRAIN", train_scores, self.train_steps_per_epoch, train_time)

        # TESTING
        test_start_time = time()
        test_steps = 0
        for _ in trange(self.test_episodes_per_epoch, desc="Testing, epoch {}".format(self._epoch),
                        leave=False, disable=not self.enable_progress_bar, file=sys.stdout):
            self.doom_wrapper.reset()
            while not self.doom_wrapper.is_terminal():
                test_steps += 1
                state = self.doom_wrapper.get_current_state()
                action_frameskip_index = self.network.get_action(session, state)
                action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
                self.doom_wrapper.make_action(action_index, frameskip)
            test_scores.append(self.doom_wrapper.get_total_reward())

        test_time = time() - test_start_time
        self.print_epoch_log("TEST", test_scores, test_steps, test_time)

        if self.write_summaries:
            log("Writing summaries.")
            train_summary = session.run(self._summaries, {self.scores_placeholder: train_scores})
            self._train_writer.add_summary(train_summary, self.steps)
            if self._run_tests:
                test_summary = session.run(self._summaries, {self.scores_placeholder: test_scores})
                self._test_writer.add_summary(test_summary, self.steps)

        # Save model
        if self._epoch % self.save_interval == 0:
            savedir = os.path.dirname(self._model_savefile)
            if not os.path.exists(savedir):
                log("Creating directory: {}".format(savedir))
                os.makedirs(savedir)
            log("Saving model to: {}".format(self._model_savefile))
            saver = tf.train.Saver()
            saver.save(session, self._model_savefile)

        overall_time = time() - overall_start_time
        log("Total elapsed time: {}\n".format(sec_to_str(overall_time)))

        self._epoch += 1
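
# train() draws a single action_frameskip_index from [0, actions_num * len(frameskips))
# and decodes it with get_action_and_frameskip(). A minimal sketch of that decoding,
# assuming a row-major layout over (action, frameskip) pairs, is shown below; the
# actual index layout in this repository may differ.
def get_action_and_frameskip(self, action_frameskip_index):
    # index = action_index * len(frameskips) + frameskip_index
    action_index, frameskip_index = divmod(action_frameskip_index, len(self.frameskips))
    return action_index, self.frameskips[frameskip_index]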