Example #1
    def log_training_start_information(self):
        text = ("\n   Agent: {}\n".format(self.agent_name) +
                "   ActionWrapper: {}\n".format(self.action_wrapper_name) +
                "   StateBuilder: {}\n".format(self.state_builder_name) +
                "   RewardBuilder: {}\n".format(self.reward_builder_name) +
                "   Environment: {}\n".format(self.env_name) +
                "   Model: {}\n".format(self.model_name))

        if hasattr(self.model, "lib") and self.model.neural_net_class is not None:
            if self.model.lib == constants.Libraries.KERAS:
                # Keras only prints its summary, so capture the lines and join them.
                stringlist = []
                self.model.dnn.model.summary(
                    print_fn=lambda x: stringlist.append(x))
                short_model_summary = "\n".join(stringlist)
                text += "       " + short_model_summary
            elif self.model.lib == constants.Libraries.PYTORCH:
                # A PyTorch module is not a string, so convert it before concatenating.
                text += "       " + str(self.model.dnn.model)
        else:
            # No backing library or no neural net class: list the layer definitions.
            for idx, layer in enumerate(self.model.build_model):
                text += "       Layer {}: {}\n".format(idx, layer)

        self.training_report += text

        rp.report(text)
Example #2
    def save(self, savepath):
        '''
        Saves this object's picklable attributes
        and any extra data it needs to persist.
        '''
        rp.report("Saving {} object...".format(self.__class__.__name__), verbosity_lvl=1)
        self.save_pickle(savepath)
        self.save_extra(savepath)
Example #3
    def ask_for_continue(self):
        if self.version != self.__curr_version:
            answer = ""
            while answer.lower() not in ("y", "n"):
                answer = rp.input("The loaded training version is {} and the current version is {}. This version mismatch may cause errors during training. Do you wish to continue? [y/n]".format(self.version, self.__curr_version), "n")

                if answer.lower() == "n":
                    rp.report("The training was stopped.")
                    exit()
Example #4
    def get_sc2_reward(self, obs):
        build_supply_depot = BuildUnitsGeneralizedRewardBuilder.ACTION_BUILD_SUPPLY_DEPOT
        build_barrack = BuildUnitsGeneralizedRewardBuilder.ACTION_BUILD_BARRACK
        build_marine = BuildUnitsGeneralizedRewardBuilder.ACTION_BUILD_MARINE
        do_nothing = BuildUnitsGeneralizedRewardBuilder.ACTION_DO_NOTHING

        current = self.get_sc2_number_of_supply_depot(obs)
        prev = self.get_sc2_number_of_supply_depot(self.previous_state)
        supply_depot_amount_diff = (current - prev)

        current = self.get_sc2_number_of_barracks(obs)
        prev = self.get_sc2_number_of_barracks(self.previous_state)
        barracks_amount_diff = (current - prev)

        current = self.get_sc2_number_of_marines(obs)
        prev = self.get_sc2_number_of_marines(self.previous_state)
        marines_amount_diff = (current - prev)

        negative_rwd = 0
        chosen_action = BuildUnitsGeneralizedRewardBuilder.LAST_CHOSEN_ACTION
        if chosen_action > -1:
            supply_depot_amount = self.get_sc2_number_of_supply_depot(obs)
            barracks_amount = self.get_sc2_number_of_barracks(obs)
            minerals = obs.player.minerals
            if chosen_action == build_supply_depot:
                if supply_depot_amount > 7 or minerals < 100:
                    negative_rwd = -10
            elif chosen_action == build_barrack:
                if supply_depot_amount <= 0 or minerals < 150:
                    negative_rwd = -10
            elif chosen_action == build_marine:
                if barracks_amount <= 0 or minerals < 50:
                    negative_rwd = -10
            #elif chosen_action == do_nothing:
            #    negative_rwd = -1

        #rwd = negative_rwd + rwdB + rwdC
        rwd = (negative_rwd + supply_depot_amount_diff +
               barracks_amount_diff * 10 + marines_amount_diff * 100)
        rp.report('''
Calculated reward is: {},
composed of:
supply_depot_amount: {},
barracks_amount: {},
marines_amount: {},
negative_rwd: {}
                '''.format(rwd, supply_depot_amount_diff,
                           barracks_amount_diff * 10,
                           marines_amount_diff * 100, negative_rwd),
                  verbosity_lvl=1)
        if supply_depot_amount_diff < 0 or barracks_amount_diff < 0 or marines_amount_diff < 0:
            return 0
        return rwd
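
The reward above weights newly built supply depots, barracks and marines at 1, 10 and 100 respectively, adds a -10 penalty for actions that cannot succeed, and zeroes the reward whenever any count decreased. A tiny standalone sketch of that arithmetic (the function name and arguments are illustrative, not URNAI API):

def build_units_reward(supply_depot_diff, barracks_diff, marines_diff, penalty=0):
    """Weighted reward: depots x1, barracks x10, marines x100, plus any penalty."""
    if supply_depot_diff < 0 or barracks_diff < 0 or marines_diff < 0:
        # Negative diffs (e.g. destroyed buildings) zero out the reward, as above.
        return 0
    return penalty + supply_depot_diff + barracks_diff * 10 + marines_diff * 100

# One new barracks and two new marines, with an invalid-action penalty:
print(build_units_reward(0, 1, 2, penalty=-10))  # -10 + 10 + 200 = 200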
Example #5
    def load_pickle(self, persist_path):
        '''
        Loads the attributes previously saved with pickle,
        if the pickle file exists and is not empty.
        '''
        # Check if the pickle file exists
        pickle_path = self.get_full_persistance_pickle_path(persist_path)
        exists_pickle = os.path.isfile(pickle_path)
        # If it does and is not empty, load it
        if exists_pickle:
            if os.path.getsize(pickle_path) > 0:
                with open(pickle_path, "rb") as pickle_in:
                    pickle_dict = pickle.load(pickle_in)
                    self.restore_pickleable_attributes(pickle_dict)
                    rp.report("**************************************** \n Pickle for " + self.get_default_save_stamp() + " loaded. \n****************************************", 1)
Example #6
    def log_train_stats(self):
        if self.ep_count > 0:
            text = ("\n" + "Current Reward Avg.: {}".format(
                sum(self.ep_rewards) / self.ep_count) +
                    " Win rate: {:10.3f}%".format(
                        (sum(self.ep_victories) / self.ep_count) * 100) +
                    " Avg number of steps: {}".format(
                        sum(self.ep_avg_steps) / self.ep_count) +
                    " Training Duration (seconds): {}".format(
                        round(time() - self.training_start, 2)) + "\n")

            self.training_report += text

            rp.report(text)
        else:
            rp.report("There are no recorded episodes!")
Example #7
    def step(self, action):
        if (self.game == GeneralizedBuildUnitsScenario.GAME_DEEP_RTS):
            BuildUnitsGeneralizedRewardBuilder.LAST_CHOSEN_ACTION = action
            if self.steps == 0:
                self.setup_map()
                self.spawn_army()
            elif self.steps == 1:
                self.collect_gold()

            if rp.VERBOSITY_LEVEL > 0:
                str_ = '''  DRTS Episode Status:
                 Number of gold = {},
                 Number of barracks = {},
                 Number of farms = {},
                 Number of soldiers = {}'''.format(
                         self.env.players[0].gold,
                         self.get_drts_unit_type_count(0, self.env.constants.Unit.Barracks),
                         self.get_drts_unit_type_count(0, self.env.constants.Unit.Farm),
                         self.get_drts_unit_type_count(0, self.env.constants.Unit.Footman),
                         )
                rp.report(str_, verbosity_lvl=1)
            # Every build branch advances the environment with the same
            # no-op action id used below (15).
            no_action = 15
            state, reward, done = None, None, None
            if action == GeneralizedBuildUnitsScenario.ACTION_DRTS_DO_NOTHING:
                state, reward, done = self.env.step(no_action)
            elif action == GeneralizedBuildUnitsScenario.ACTION_DRTS_BUILD_FARM:
                self.build_farm()
                state, reward, done = self.env.step(no_action)
            elif action == GeneralizedBuildUnitsScenario.ACTION_DRTS_BUILD_BARRACK:
                self.build_barrack()
                state, reward, done = self.env.step(no_action)
            elif action == GeneralizedBuildUnitsScenario.ACTION_DRTS_BUILD_FOOTMAN:
                self.build_footman()
                state, reward, done = self.env.step(no_action)
            else:
                state, reward, done = self.env.step(action)
            self.steps += 1
            return state, reward, done

        elif (self.game == GeneralizedBuildUnitsScenario.GAME_STARCRAFT_II):
            self.steps += 1
            return self.env.step(action)
Example #8
    def log_ep_stats(self):
        if self.ep_count > 0:

            agent_info = dict.fromkeys(self.agent_info)
            for key in agent_info:
                agent_info[key] = self.agent_info[key][-1]

            rp.report(
                "Episode: {}/{} | Outcome: {} | Episode Avg. Reward: {:10.6f} | Episode Reward: {:10.6f} | Episode Steps: {:10.6f} | Best Reward was {} on episode: {} | Episode Duration (seconds): {} | Episode SPS: {} | SPS AVG: {} | Agent info: {}"
                .format(self.ep_count, self.ep_total, self.ep_victories[-1],
                        self.ep_avg_rewards[-1], self.ep_rewards[-1],
                        self.ep_steps_count[-1], self.best_reward,
                        self.best_reward_episode,
                        self.episode_duration_list[-1],
                        self.episode_sps_list[-1], self.avg_sps_list[-1],
                        agent_info))
        else:
            rp.report("There are no recorded episodes!")
Example #9
    def test_agent(self):
        #backup attributes
        max_test_episodes_backup = self.max_test_episodes
        curr_playing_episodes_backup = self.curr_playing_episodes
        logger_backup = self.logger
        #full_save_play_path_backup = self.full_save_play_path
        enable_save_backup = self.enable_save

        #set attributes to test agent
        self.enable_save = False
        #self.full_save_play_path = self.full_save_path + os.path.sep + "inside_training_play_files" + os.path.sep + "test_at_training_episode_{}".format(self.curr_training_episodes)
        #self.make_persistance_dirs(self.log_actions)
        self.max_test_episodes = self.reward_test_number_of_episodes
        self.curr_playing_episodes = 0

        rp.report("> Starting to check current agent performance.")
        #make the agent play
        self.play()
        rp.report("> Finished checking current agent performance.")

        #get_reward_avg
        rwd_avg = self.logger.ep_avg_rewards[-1]
        #save this logger for later saving
        #this is needed to get some more detailed
        #info on tests
        logger_dict = {}
        logger_dict["logger"] = self.logger
        logger_dict["saved"] = False
        self.inside_training_test_loggers.append(logger_dict)

        #restore backup
        self.max_test_episodes = max_test_episodes_backup
        self.curr_playing_episodes = curr_playing_episodes_backup
        self.logger = logger_backup
        #self.full_save_play_path = full_save_play_path_backup
        self.enable_save = enable_save_backup

        #register reward avg:
        self.logger.inside_training_test_avg_rwds.append(rwd_avg)
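
test_agent() temporarily swaps a few attributes, runs play(), then restores them by hand. As a hedged alternative sketch (not URNAI API), the same backup/restore idea can be wrapped in a context manager so the originals come back even if play() raises:

from contextlib import contextmanager

@contextmanager
def temporary_attributes(obj, **overrides):
    """Temporarily override attributes on obj, restoring them afterwards."""
    originals = {name: getattr(obj, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(obj, name, value)
        yield obj
    finally:
        # Restore the original values even if the body raised.
        for name, value in originals.items():
            setattr(obj, name, value)

# Hypothetical usage mirroring test_agent():
# with temporary_attributes(trainer, enable_save=False,
#                           max_test_episodes=trainer.reward_test_number_of_episodes,
#                           curr_playing_episodes=0):
#     trainer.play()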
Example #10
    def setup(self,
              env,
              agent,
              max_training_episodes,
              max_test_episodes,
              max_steps_training,
              max_steps_testing,
              save_path=os.path.expanduser("~") + os.path.sep +
              "urnai_saved_traingings",
              file_name=str(datetime.now()).replace(" ", "_").replace(
                  ":", "_").replace(".", "_"),
              enable_save=True,
              save_every=10,
              relative_path=False,
              debug_level=0,
              reset_epsilon=False,
              tensorboard_logging=False,
              log_actions=True,
              episode_batch_avg_calculation=10,
              do_reward_test=False,
              reward_test_number_of_episodes=10,
              rolling_avg_window_size=20):
        self.versioner = Versioner()
        self.env = env
        self.agent = agent
        self.save_path = save_path
        self.file_name = file_name
        self.enable_save = enable_save
        self.save_every = save_every
        self.relative_path = relative_path
        self.reset_epsilon = reset_epsilon
        self.max_training_episodes = max_training_episodes
        self.max_test_episodes = max_test_episodes
        self.max_steps_training = max_steps_training
        self.max_steps_testing = max_steps_testing
        self.curr_training_episodes = -1
        self.curr_playing_episodes = -1
        rp.VERBOSITY_LEVEL = debug_level
        self.tensorboard_logging = tensorboard_logging
        self.log_actions = log_actions
        self.episode_batch_avg_calculation = episode_batch_avg_calculation
        self.do_reward_test = do_reward_test
        self.reward_test_number_of_episodes = reward_test_number_of_episodes
        self.rolling_avg_window_size = rolling_avg_window_size
        self.inside_training_test_loggers = []

        self.logger = Logger(
            0,
            self.agent.__class__.__name__,
            self.agent.model.__class__.__name__,
            self.agent.model,
            self.agent.action_wrapper.__class__.__name__,
            self.agent.action_wrapper.get_action_space_dim(),
            self.agent.action_wrapper.get_named_actions(),
            self.agent.state_builder.__class__.__name__,
            self.agent.reward_builder.__class__.__name__,
            self.env.__class__.__name__,
            log_actions=self.log_actions,
            episode_batch_avg_calculation=self.episode_batch_avg_calculation,
            rolling_avg_window_size=self.rolling_avg_window_size)

        # Adding epsilon, learning rate and gamma factors to our pickle black list,
        # so that they are not loaded when loading the model's weights.
        # Making it so that the current training session acts as a brand new training session
        # (except for the fact that the model's weights may already be somewhat optimized from previous trainings)
        if self.reset_epsilon:
            self.agent.model.pickle_black_list.append("epsilon_greedy")
            self.agent.model.pickle_black_list.append("epsilon_decay_rate")
            self.agent.model.pickle_black_list.append("epsilon_min")
            self.agent.model.pickle_black_list.append("gamma")
            self.agent.model.pickle_black_list.append("learning_rate")
            self.agent.model.pickle_black_list.append("learning_rate_min")
            self.agent.model.pickle_black_list.append("learning_rate_decay")
            self.agent.model.pickle_black_list.append(
                "learning_rate_decay_ep_cutoff")

        currentdir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        parentdir = os.path.dirname(currentdir)
        parentdir = os.path.dirname(parentdir)
        if (relative_path):
            self.full_save_path = parentdir + os.path.sep + self.save_path + os.path.sep + self.file_name
        else:
            self.full_save_path = self.save_path + os.path.sep + self.file_name

        self.full_save_play_path = self.full_save_path + os.path.sep + "play_files"

        if self.enable_save and os.path.exists(self.full_save_path):
            rp.report("WARNING! Loading training from " + self.full_save_path +
                      " with SAVING ENABLED.")
            self.load(self.full_save_path)
            self.versioner.ask_for_continue()
            self.make_persistance_dirs(self.log_actions)
        elif self.enable_save:
            rp.report("WARNING! Starting new training on " +
                      self.full_save_path + " with SAVING ENABLED.")
            self.make_persistance_dirs(self.log_actions)
        else:
            rp.report(
                "WARNING! Starting new training WITHOUT SAVING PROGRESS.")

        if (self.tensorboard_logging):
            logdir = self.full_save_path + "/tf_logs"
            self.agent.model.tensorboard_callback = [
                tf.keras.callbacks.TensorBoard(log_dir=logdir)
            ]
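
The save_path and file_name defaults above build a per-run directory under the user's home folder from a timestamp with spaces, colons and dots replaced by underscores (and, being default arguments, they are evaluated once when setup() is defined). A minimal standalone sketch of that convention, keeping the directory name spelled exactly as in the example; the helper name is illustrative:

import os
from datetime import datetime

def default_save_dir(base_name="urnai_saved_traingings"):
    """Build <home>/<base_name>/<sanitized timestamp>, mirroring setup()'s defaults."""
    stamp = str(datetime.now()).replace(" ", "_").replace(":", "_").replace(".", "_")
    return os.path.expanduser("~") + os.path.sep + base_name + os.path.sep + stamp

# e.g. /home/user/urnai_saved_traingings/2023-01-30_12_34_56_789012
print(default_save_dir())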
Example #11
    def training_loop(self, is_testing, reward_from_agent=True):
        start_time = time.time()
        #current_episodes = 0

        if is_testing:
            rp.report("\n\n> Playing")
            max_episodes = self.max_test_episodes
            max_steps = self.max_steps_testing
            current_episodes = self.curr_playing_episodes
        else:
            rp.report("> Training")
            max_episodes = self.max_training_episodes
            max_steps = self.max_steps_training
            current_episodes = self.curr_training_episodes

        if self.logger.ep_count == 0 or is_testing:
            self.logger = Logger(
                max_episodes,
                self.agent.__class__.__name__,
                self.agent.model.__class__.__name__,
                self.agent.model,
                self.agent.action_wrapper.__class__.__name__,
                self.agent.action_wrapper.get_action_space_dim(),
                self.agent.action_wrapper.get_named_actions(),
                self.agent.state_builder.__class__.__name__,
                self.agent.reward_builder.__class__.__name__,
                self.env.__class__.__name__,
                log_actions=self.log_actions,
                episode_batch_avg_calculation=self.
                episode_batch_avg_calculation,
                rolling_avg_window_size=self.rolling_avg_window_size)

        while current_episodes < max_episodes:
            current_episodes += 1
            self.env.start()

            if is_testing:
                self.curr_playing_episodes = current_episodes
            else:
                self.curr_training_episodes = current_episodes

            # Reset the environment
            obs = self.env.reset()
            step_reward = 0
            done = False
            # Passing the episode to the agent reset, so that it can be passed to model reset
            # Allowing the model to track the episode number, and decide if it should diminish the
            # Learning Rate, depending on the currently selected strategy.
            self.agent.reset(current_episodes)

            ep_reward = 0
            victory = False

            ep_actions = np.zeros(
                self.agent.action_wrapper.get_action_space_dim())
            self.logger.record_episode_start()

            for step in range(max_steps):
                # Choosing an action and passing it to our env.step() in order to act on our environment
                action = self.agent.step(obs, done, is_testing)
                # Take the action (a) and observe the outcome state (s') and reward (r)
                obs, default_reward, done = self.env.step(action)

                # Logic to test whether this is the last step of this episode
                is_last_step = step == max_steps - 1
                done = done or is_last_step

                # Checking whether or not to use the reward from the reward builder so we can pass that to the agent
                if reward_from_agent:
                    step_reward = self.agent.get_reward(
                        obs, default_reward, done)
                else:
                    step_reward = default_reward

                # Making the agent learn
                if not is_testing:
                    self.agent.learn(obs, step_reward, done)

                # Adding our step reward to the total count of the episode's reward
                ep_reward += step_reward
                ep_actions[self.agent.previous_action] += 1

                if done:
                    victory = default_reward == 1
                    agent_info = {
                        "Learning rate": self.agent.model.learning_rate,
                        "Gamma": self.agent.model.gamma,
                        "Epsilon": self.agent.model.epsilon_greedy,
                    }
                    self.logger.record_episode(ep_reward, victory, step + 1,
                                               agent_info, ep_actions)
                    break

            self.logger.log_ep_stats()

            # check if user wants to pause training and test agent
            # if self.do_reward_test and current_episodes % self.episode_batch_avg_calculation == 0 and current_episodes > 1:
            if (
                    not is_testing
            ) and self.do_reward_test and current_episodes % self.episode_batch_avg_calculation == 0:
                self.test_agent()

            # if this is not a test (evaluation), saving is enabled and we are in a multiple
            # of our save_every variable then we save the model and generate graphs
            if (
                    not is_testing
            ) and self.enable_save and current_episodes > 0 and current_episodes % self.save_every == 0:
                self.save(self.full_save_path)

                # if we have done tests along the training save all loggers for further detailed analysis
                if self.do_reward_test and len(
                        self.inside_training_test_loggers) > 0:
                    for idx in range(
                            len(self.logger.ep_avg_batch_rewards_episodes)):
                        logger_dict = self.inside_training_test_loggers[idx]
                        if not logger_dict["saved"]:
                            episode = self.logger.ep_avg_batch_rewards_episodes[
                                idx]
                            backup_full_save_path = self.full_save_path
                            self.full_save_path = self.full_save_path + os.path.sep + "inside_training_play_files" + os.path.sep + "test_at_training_episode_{}".format(
                                episode)
                            self.make_persistance_dirs(self.log_actions)
                            logger_dict["logger"].save(self.full_save_path)
                            logger_dict["saved"] = True
                            self.full_save_path = backup_full_save_path

        end_time = time.time()
        if is_testing:
            rp.report("\n> Test duration: {} seconds".format(end_time -
                                                             start_time))
            self.logger.log_train_stats()
        else:
            rp.report("\n> Training duration: {} seconds".format(end_time -
                                                                 start_time))
            self.logger.log_train_stats()

        # Saving the model at the end of the training loop
        if self.enable_save:
            if is_testing:
                self.logger.save(self.full_save_play_path)
                rp.save(self.full_save_play_path)
            else:
                self.save(self.full_save_path)

                # if we have done tests along the training save all loggers for further detailed analysis
                if self.do_reward_test and len(
                        self.inside_training_test_loggers) > 0:
                    for idx in range(
                            len(self.logger.ep_avg_batch_rewards_episodes)):
                        logger_dict = self.inside_training_test_loggers[idx]
                        if not logger_dict["saved"]:
                            episode = self.logger.ep_avg_batch_rewards_episodes[
                                idx]
                            backup_full_save_path = self.full_save_path
                            self.full_save_path = self.full_save_path + os.path.sep + "inside_training_play_files" + os.path.sep + "test_at_training_episode_{}".format(
                                episode)
                            self.make_persistance_dirs(self.log_actions)
                            logger_dict["logger"].save(self.full_save_path)
                            logger_dict["saved"] = True
                            self.full_save_path = backup_full_save_path
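
For orientation, here is a stripped-down, hypothetical sketch of the episode-loop shape used above (env reset, agent step, env step, optional learn, per-episode bookkeeping). ToyEnv and ToyAgent are placeholders, not URNAI classes:

import random

class ToyEnv:
    """Illustrative stand-in for an URNAI environment."""
    def start(self): pass
    def reset(self): return 0
    def step(self, action):
        # state, reward, done
        return random.randint(0, 3), random.random(), random.random() < 0.1

class ToyAgent:
    """Illustrative stand-in for an URNAI agent."""
    def reset(self, episode): pass
    def step(self, obs, done, is_testing): return random.randint(0, 1)
    def learn(self, obs, reward, done): pass

def run_episodes(env, agent, max_episodes, max_steps, is_testing=False):
    for episode in range(max_episodes):
        env.start()
        obs = env.reset()
        agent.reset(episode)
        ep_reward, done = 0, False
        for step in range(max_steps):
            action = agent.step(obs, done, is_testing)
            obs, reward, done = env.step(action)
            done = done or step == max_steps - 1  # treat the last step as terminal
            if not is_testing:
                agent.learn(obs, reward, done)
            ep_reward += reward
            if done:
                break
        print("episode", episode, "reward", round(ep_reward, 2))

run_episodes(ToyEnv(), ToyAgent(), max_episodes=2, max_steps=5)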
Example #12
    def old_play(self, test_params=None, reward_from_agent=True):
        rp.report("\n\n> Playing")

        self.logger = Logger(
            self.max_test_episodes,
            self.agent.__class__.__name__,
            self.agent.model.__class__.__name__,
            self.agent.model,
            self.agent.action_wrapper.__class__.__name__,
            self.agent.action_wrapper.get_action_space_dim(),
            self.agent.action_wrapper.get_named_actions(),
            self.agent.state_builder.__class__.__name__,
            self.agent.reward_builder.__class__.__name__,
            self.env.__class__.__name__,
            log_actions=self.log_actions,
            episode_batch_avg_calculation=self.episode_batch_avg_calculation,
            rolling_avg_window_size=self.rolling_avg_window_size)

        while self.curr_playing_episodes < self.max_test_episodes:
            self.curr_playing_episodes += 1
            self.env.start()

            # Reset the environment
            obs = self.env.reset()
            step_reward = 0
            done = False
            # Passing the episode to the agent reset, so that it can be passed to model reset
            # Allowing the model to track the episode number, and decide if it should diminish the
            # Learning Rate, depending on the currently selected strategy.
            self.agent.reset(self.curr_playing_episodes)

            ep_reward = 0
            victory = False

            ep_actions = np.zeros(
                self.agent.action_wrapper.get_action_space_dim())
            self.logger.record_episode_start()

            for step in range(self.max_steps_testing):
                action = self.agent.step(obs, done, is_testing=True)
                # Take the action (a) and observe the outcome state(s') and reward (r)
                obs, default_reward, done = self.env.step(action)

                is_last_step = step == self.max_steps_testing - 1
                done = done or is_last_step

                if reward_from_agent:
                    step_reward = self.agent.get_reward(
                        obs, default_reward, done)
                else:
                    step_reward = default_reward

                ep_reward += step_reward

                ep_actions[self.agent.previous_action] += 1

                # If done: finish episode
                if done:
                    victory = default_reward == 1
                    agent_info = {
                        "Learning rate": self.agent.model.learning_rate,
                        "Gamma": self.agent.model.gamma,
                        "Epsilon": self.agent.model.epsilon_greedy,
                    }
                    self.logger.record_episode(ep_reward, victory, step + 1,
                                               agent_info, ep_actions)
                    break

            self.logger.log_ep_stats()

        if test_params is not None:
            test_params.logger.record_play_test(test_params.current_ep_count,
                                                self.logger.ep_rewards,
                                                self.logger.victories,
                                                self.max_test_episodes)
        else:
            # Only logs train stats if this is not a test, to avoid cluttering the interface with info
            self.logger.log_train_stats()

        # We need to save playing status as well
        if self.enable_save:
            self.logger.save(self.full_save_play_path)
            rp.save(self.full_save_play_path)
Example #13
    def old_train(self,
                  test_params: TestParams = None,
                  reward_from_agent=True):
        start_time = time.time()

        rp.report("> Training")
        if self.logger.ep_count == 0:
            self.logger = Logger(
                self.max_training_episodes,
                self.agent.__class__.__name__,
                self.agent.model.__class__.__name__,
                self.agent.model,
                self.agent.action_wrapper.__class__.__name__,
                self.agent.action_wrapper.get_action_space_dim(),
                self.agent.action_wrapper.get_named_actions(),
                self.agent.state_builder.__class__.__name__,
                self.agent.reward_builder.__class__.__name__,
                self.env.__class__.__name__,
                log_actions=self.log_actions,
                episode_batch_avg_calculation=self.
                episode_batch_avg_calculation,
                rolling_avg_window_size=self.rolling_avg_window_size)

        if test_params is not None:
            test_params.logger = self.logger

        while self.curr_training_episodes < self.max_training_episodes:
            self.curr_training_episodes += 1
            self.env.start()

            # Reset the environment
            obs = self.env.reset()
            step_reward = 0
            done = False
            # Passing the episode to the agent reset, so that it can be passed to model reset
            # Allowing the model to track the episode number, and decide if it should diminish the
            # Learning Rate, depending on the currently selected strategy.
            self.agent.reset(self.curr_training_episodes)

            ep_reward = 0
            victory = False

            ep_actions = np.zeros(
                self.agent.action_wrapper.get_action_space_dim())
            self.logger.record_episode_start()

            for step in range(self.max_steps_training):

                # Choosing an action and passing it to our env.step() in order to act on our environment
                action = self.agent.step(obs, done, is_testing=False)
                obs, default_reward, done = self.env.step(action)

                is_last_step = step == self.max_steps_training - 1
                done = done or is_last_step

                # Checking whether or not to use the reward from the reward builder so we can pass that to the agent
                if reward_from_agent:
                    step_reward = self.agent.get_reward(
                        obs, default_reward, done)
                else:
                    step_reward = default_reward

                # Making the agent learn
                self.agent.learn(obs, step_reward, done)

                # Adding our step reward to the total count of the episode's reward
                ep_reward += step_reward

                ep_actions[self.agent.previous_action] += 1

                if done:
                    victory = default_reward == 1
                    agent_info = {
                        "Learning rate": self.agent.model.learning_rate,
                        "Gamma": self.agent.model.gamma,
                        "Epsilon": self.agent.model.epsilon_greedy,
                    }
                    self.logger.record_episode(ep_reward, victory, step + 1,
                                               agent_info, ep_actions)
                    break

            self.logger.log_ep_stats()

            #check if user wants to pause training and test agent
            #if self.do_reward_test and self.curr_training_episodes % self.episode_batch_avg_calculation == 0 and self.curr_training_episodes > 1:
            if self.do_reward_test and self.curr_training_episodes % self.episode_batch_avg_calculation == 0:
                self.test_agent()

            if self.enable_save and self.curr_training_episodes > 0 and self.curr_training_episodes % self.save_every == 0:
                self.save(self.full_save_path)

                #if we have done tests along the training
                #save all loggers for further detailed analysis
                #this was needed because the play() method
                #was saving these loggers every test, slowing down
                #training a lot. Putting this code here allows
                #to save them once and optimize training time.
                if self.do_reward_test and len(
                        self.inside_training_test_loggers) > 0:
                    for idx in range(
                            len(self.logger.ep_avg_batch_rewards_episodes)):
                        logger_dict = self.inside_training_test_loggers[idx]
                        if not logger_dict["saved"]:
                            episode = self.logger.ep_avg_batch_rewards_episodes[
                                idx]
                            backup_full_save_path = self.full_save_path
                            self.full_save_path = self.full_save_path + os.path.sep + "inside_training_play_files" + os.path.sep + "test_at_training_episode_{}".format(
                                episode)
                            self.make_persistance_dirs(self.log_actions)
                            logger_dict["logger"].save(self.full_save_path)
                            logger_dict["saved"] = True
                            self.full_save_path = backup_full_save_path

            if test_params is not None and self.curr_training_episodes % test_params.test_steps == 0 and self.curr_training_episodes != 0:
                test_params.current_ep_count = self.curr_training_episodes
                self.play(test_params.num_matches, test_params.max_steps,
                          test_params)

                # Stops training if reward threshold was reached in play testing
                if test_params.reward_threshold is not None and test_params.reward_threshold <= test_params.logger.play_rewards_avg[-1]:
                    rp.report("> Reward threshold was reached!")
                    rp.report("> Stopping training")
                    break

        end_time = time.time()
        rp.report("\n> Training duration: {} seconds".format(end_time -
                                                             start_time))

        self.logger.log_train_stats()
        self.logger.plot_train_stats()
        # Saving the model when the training has ended
        if self.enable_save:
            self.save(self.full_save_path)
            #if we have done tests along the training
            #save all loggers for further detailed analysis
            #this was needed because the play() method
            #was saving these loggers every test, slowing down
            #training a lot. Putting this code here allows
            #to save them once and optimize training time.
            if self.do_reward_test and len(
                    self.inside_training_test_loggers) > 0:
                for idx in range(len(
                        self.logger.ep_avg_batch_rewards_episodes)):
                    logger_dict = self.inside_training_test_loggers[idx]
                    if not logger_dict["saved"]:
                        episode = self.logger.ep_avg_batch_rewards_episodes[
                            idx]
                        backup_full_save_path = self.full_save_path
                        self.full_save_path = self.full_save_path + os.path.sep + "inside_training_play_files" + os.path.sep + "test_at_training_episode_{}".format(
                            episode)
                        self.make_persistance_dirs(self.log_actions)
                        logger_dict["logger"].save(self.full_save_path)
                        logger_dict["saved"] = True
                        self.full_save_path = backup_full_save_path
Example #14
    def get_drts_reward(self, obs):
        player = 0
        footman = 5
        farm = 6
        barracks = 4
        build_farm = RTSGeneralization.ACTION_DRTS_BUILD_FARM
        build_barrack = RTSGeneralization.ACTION_DRTS_BUILD_BARRACK
        build_footman = RTSGeneralization.ACTION_DRTS_BUILD_FOOTMAN
        do_nothing = RTSGeneralization.ACTION_DRTS_DO_NOTHING

        current = self.get_drts_number_of_specific_units(obs, player, farm)
        prev = self.get_drts_number_of_specific_units(self.previous_state,
                                                      player, farm)
        farm_amount_curr = (current - prev)

        current = self.get_drts_number_of_specific_units(obs, player, barracks)
        prev = self.get_drts_number_of_specific_units(self.previous_state,
                                                      player, barracks)
        barracks_amount_curr = (current - prev)

        current = self.get_drts_number_of_specific_units(obs, player, footman)
        prev = self.get_drts_number_of_specific_units(self.previous_state,
                                                      player, footman)
        footman_amount_curr = (current - prev)

        negative_rwd = 0
        chosen_action = BuildUnitsGeneralizedRewardBuilder.LAST_CHOSEN_ACTION
        if chosen_action > -1:
            farm_number = self.get_drts_number_of_specific_units(
                obs, player, farm)
            barracks_amount = self.get_drts_number_of_specific_units(
                obs, player, barracks)
            gold_amount = obs['players'][0].gold
            if chosen_action == build_farm:
                if farm_number > 7 or gold_amount < 500:
                    negative_rwd = -10
            elif chosen_action == build_barrack:
                if farm_number <= 0 or gold_amount < 700:
                    negative_rwd = -10
            elif chosen_action == build_footman:
                if barracks_amount <= 0 or gold_amount < 600:
                    negative_rwd = -10
            #elif chosen_action == do_nothing:
            #    negative_rwd = -1

        #rwd = negative_rwd + rwdB + rwdC
        rwd = (negative_rwd + farm_amount_curr + barracks_amount_curr * 10 +
               footman_amount_curr * 100)
        rp.report('''
Calculated reward is: {},
composed of:
farm_amount: {},
barracks_amount: {},
footman_amount: {},
negative_rwd: {}
                '''.format(rwd, farm_amount_curr, barracks_amount_curr * 10,
                           footman_amount_curr * 100, negative_rwd),
                  verbosity_lvl=1)
        if farm_amount_curr < 0 or barracks_amount_curr < 0 or footman_amount_curr < 0:
            return 0
        return rwd