Example #1
    def __init__(self, env, learning_rate, buffer_size, batch_size, n_epochs,
                 gamma, gae_lam, clip_range, ent_coef, vf_coef, max_grad_norm):
        self.env = env
        self.lr = learning_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.gamma = gamma
        self.gae_lam = gae_lam
        self.clip_range = clip_range
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.num_timesteps = 0

        self.ep_info_buffer = deque(maxlen=100)
        self._n_updates = 0
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        if isinstance(env, VecEnv):
            self.num_envs = env.num_envs

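        # Running mean/std trackers for observations (1x1x84x84 frames) and rewards,
        # used to normalize inputs and returns during training.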
        self.rms_obs = RunningMeanStd(shape=(1, 1, 84, 84))
        self.rms_rew = RunningMeanStd()

        logger.configure('./logs')
Example #2
    def __init__(self, env_id, lr, nstep, batch_size, n_epochs, gamma, gae_lam,
                 clip_range, ent_coef, vf_coef, max_grad_norm):

        self.env_id = env_id

        self.env = make_env(env_id, n_envs=4)

        self.num_envs = self.env.num_envs if isinstance(self.env,
                                                        VecEnv) else 1
        self.state_dim = self.env.observation_space.shape[0]
        self.action_converter = ActionConverter(self.env.action_space)

        self.lr = lr
        self.nstep = nstep
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.gamma = gamma
        self.gae_lam = gae_lam
        self.clip_range = clip_range
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm

        self.ep_info_buffer = deque(maxlen=50)
        self._n_updates = 0
        self.num_timesteps = 0
        self.num_episodes = 0

        self.obs_rms = RunningMeanStd()
Example #3
    def __init__(self,
                 *,
                 env_id,
                 lr=3e-4,
                 nstep=128,
                 batch_size=128,
                 n_epochs=10,
                 gamma=0.99,
                 int_gamma=0.99,
                 gae_lam=0.95,
                 clip_range=0.2,
                 ent_coef=.01,
                 vf_coef=0.5,
                 int_vf_coef=0.5,
                 max_grad_norm=0.2,
                 hidden_size=128,
                 int_hidden_size=128,
                 int_lr=3e-4,
                 rnd_start=1e+3):
        super(PPO_RND, self).__init__(env_id, lr, nstep, batch_size, n_epochs,
                                      gamma, gae_lam, clip_range, ent_coef,
                                      vf_coef, max_grad_norm)

        self.policy = Policy(self.env, hidden_size, intrinsic_model=True)
        self.rnd = RndNetwork(self.state_dim, hidden_size=int_hidden_size)
        self.rollout = IntrinsicStorage(nstep,
                                        self.num_envs,
                                        self.env.observation_space,
                                        self.env.action_space,
                                        gae_lam=gae_lam,
                                        gamma=gamma,
                                        int_gamma=int_gamma)
        self.optimizer = optim.Adam(self.policy.net.parameters(), lr=lr)
        self.rnd_optimizer = optim.Adam(self.rnd.parameters(), lr=int_lr)

        self.rnd_start = rnd_start
        self.int_vf_coef = int_vf_coef

        self.last_obs = self.env.reset()

        self.int_rew_rms = RunningMeanStd()

        self.normalize = True
        self.last_dones = np.array([0 for _ in range(self.num_envs)])
Example #4
    def initialize(self,
                   session_name="default_session",
                   num_slaves=8,
                   tps=10000,
                   use_evaluation=False):

        # get parameters from config
        self._numSlaves = num_slaves

        self._gamma = 0.99
        self._lambd = 0.95
        self._clipRange = 0.2

        self._learningRatePolicy = 1e-4
        self._learningRatePolicyDecay = 0.9993
        self._learningRateValueFunction = 1e-3

        self._batchSize = 1024
        self._transitionsPerIteration = 20000

        # If use_evaluation is True, training progress is evaluated with a separate evaluation function; otherwise it is measured from the transitions collected during training.
        self._useEvaluation = use_evaluation

        self._sessionName = session_name

        # initialize environment
        # TODO
        agents = [
            holodeck.agents.AgentDefinition(
                agent_name="android" + str(i),
                agent_type=holodeck.agents.AndroidAgent,
                sensors=[holodeck.sensors.CustomSensor],
                starting_loc=(-1, 0, .3),
                starting_rot=(0, 0, 0),
                is_main_agent=True) for i in range(self._numSlaves)
        ]
        self._env = HolodeckEnvironment(agent_definitions=agents,
                                        start_world=False,
                                        ticks_per_sec=tps)
        # self._env = holodeck.make("PPO")
        # self._env.should_render_viewport(False)

        self._stateSize = 18 * 3 + 18 * 3 + 5 * 3 + 5 * 3 + 1
        self._rewardSize = 5
        self._eoeSize = 2
        self._actionSize = 18 * 3

        # initialize networks
        self._policy = Policy(self._actionSize)
        self._policy.build(self._stateSize)
        self._valueFunction = ValueFunction()
        self._valueFunction.build(self._stateSize)

        # initialize RunningMeanStd
        self._rms = RunningMeanStd(shape=(self._stateSize, ))

        # initialize replay buffer
        self._replayBuffer = ReplayBuffer()

        self._policyOptimizer = tf.keras.optimizers.Adam(
            learning_rate=self.decayedLearningRatePolicy)
        self._valueFunctionOptimizer = tf.keras.optimizers.Adam(
            learning_rate=self._learningRateValueFunction)

        # initialize saver
        # self._saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=1)
        # save maximum step network
        self._smax = 0
        # save maximum reward network
        self._rmax = 0

        # initialize statistics variables
        # TODO
        self._summary_num_log = 0
        self._summary_num_episodes_total = 0
        self._summary_num_transitions_total = 0

        self._summary_max_episode_length = 0

        self._summary_total_rewards = []
        self._summary_total_rewards_by_parts = np.array([[]] * 5)
        self._summary_mean_rewards = []
        self._summary_transition_per_episodes = []
        self._summary_noise_records = []

        self._summary_evaluation_total_rewards = []
        self._summary_evaluation_total_rewards_by_parts = np.array([[]] * 5)
        self._summary_evaluation_mean_rewards = []
        self._summary_evaluation_transition_per_episodes = []

        # initialize checkpoint
        self._ckpt = tf.train.Checkpoint(
            policy_mean=self._policy.mean,
            policy_logstd=self._policy.logstd,
            valueFunction=self._valueFunction.value
            # policyOptimizer=self._policyOptimizer,
            # valueFunctionOptimizer=self._valueFunctionOptimizer
        )

        self._isNetworkLoaded = False
        self._loadedNetwork = ""
Example #5
class TrackingController:
    def __init__(self):
        random.seed(int(time.time()))
        np.random.seed(int(time.time()))

        self._startTime = time.time()
        self._summary_sim_time = 0
        self._summary_train_time = 0

        self._timeChecker = util.TimeChecker()

    def initialize(self,
                   session_name="default_session",
                   num_slaves=8,
                   tps=10000,
                   use_evaluation=False):

        # get parameters from config
        self._numSlaves = num_slaves

        self._gamma = 0.99
        self._lambd = 0.95
        self._clipRange = 0.2

        self._learningRatePolicy = 1e-4
        self._learningRatePolicyDecay = 0.9993
        self._learningRateValueFunction = 1e-3

        self._batchSize = 1024
        self._transitionsPerIteration = 20000

        # If use_evaluation is True, training progress is evaluated with a separate evaluation function; otherwise it is measured from the transitions collected during training.
        self._useEvaluation = use_evaluation

        self._sessionName = session_name

        # initialize environment
        # TODO
        agents = [
            holodeck.agents.AgentDefinition(
                agent_name="android" + str(i),
                agent_type=holodeck.agents.AndroidAgent,
                sensors=[holodeck.sensors.CustomSensor],
                starting_loc=(-1, 0, .3),
                starting_rot=(0, 0, 0),
                is_main_agent=True) for i in range(self._numSlaves)
        ]
        self._env = HolodeckEnvironment(agent_definitions=agents,
                                        start_world=False,
                                        ticks_per_sec=tps)
        # self._env = holodeck.make("PPO")
        # self._env.should_render_viewport(False)

        self._stateSize = 18 * 3 + 18 * 3 + 5 * 3 + 5 * 3 + 1
        self._rewardSize = 5
        self._eoeSize = 2
        self._actionSize = 18 * 3

        # initialize networks
        self._policy = Policy(self._actionSize)
        self._policy.build(self._stateSize)
        self._valueFunction = ValueFunction()
        self._valueFunction.build(self._stateSize)

        # initialize RunningMeanStd
        self._rms = RunningMeanStd(shape=(self._stateSize, ))

        # initialize replay buffer
        self._replayBuffer = ReplayBuffer()

        self._policyOptimizer = tf.keras.optimizers.Adam(
            learning_rate=self.decayedLearningRatePolicy)
        self._valueFunctionOptimizer = tf.keras.optimizers.Adam(
            learning_rate=self._learningRateValueFunction)

        # initialize saver
        # self._saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=1)
        # save maximum step network
        self._smax = 0
        # save maximum reward network
        self._rmax = 0

        # initialize statistics variables
        # TODO
        self._summary_num_log = 0
        self._summary_num_episodes_total = 0
        self._summary_num_transitions_total = 0

        self._summary_max_episode_length = 0

        self._summary_total_rewards = []
        self._summary_total_rewards_by_parts = np.array([[]] * 5)
        self._summary_mean_rewards = []
        self._summary_transition_per_episodes = []
        self._summary_noise_records = []

        self._summary_evaluation_total_rewards = []
        self._summary_evaluation_total_rewards_by_parts = np.array([[]] * 5)
        self._summary_evaluation_mean_rewards = []
        self._summary_evaluation_transition_per_episodes = []

        # initialize checkpoint
        self._ckpt = tf.train.Checkpoint(
            policy_mean=self._policy.mean,
            policy_logstd=self._policy.logstd,
            valueFunction=self._valueFunction.value
            # policyOptimizer=self._policyOptimizer,
            # valueFunctionOptimizer=self._valueFunctionOptimizer
        )

        self._isNetworkLoaded = False
        self._loadedNetwork = ""

    def decayedLearningRatePolicy(self):
        return self._learningRatePolicy

    # load trained networks & rms
    def loadNetworks(self, directory, network_type=None):
        # load rms
        rms_dir = "{}/rms/".format(directory)
        if (network_type is None) or (network_type == ""):
            mean_dir = rms_dir + "mean.npy"
            var_dir = rms_dir + "var.npy"
        else:
            mean_dir = rms_dir + "mean_{}.npy".format(network_type)
            var_dir = rms_dir + "var_{}.npy".format(network_type)

        if os.path.exists(mean_dir):
            print("Loading RMS parameters")
            self._rms.mean = np.load(mean_dir)
            self._rms.var = np.load(var_dir)
            self._rms.count = 200000000

        # load network
        if network_type is not None:
            network_dir = "{}/network-{}".format(directory, network_type)
        else:
            network_dir = "{}/network".format(directory)
        print("Loading networks from {}".format(network_dir))

        self.restore(network_dir)

        self._isNetworkLoaded = True
        self._loadedNetwork = "{}".format(network_dir)

    def computeTDAndGAE(self):
        self._collectedStates = [
            None
        ] * self._summary_num_transitions_per_iteration
        self._collectedActions = [
            None
        ] * self._summary_num_transitions_per_iteration
        self._collectedNeglogprobs = [
            None
        ] * self._summary_num_transitions_per_iteration
        self._collectedTDs = [None
                              ] * self._summary_num_transitions_per_iteration
        self._collectedGAEs = [None
                               ] * self._summary_num_transitions_per_iteration

        startIdx = 0
        for epi in self._collectedEpisodes:
            data = epi.data
            size = len(data)

            # update max episode length
            if size > self._summary_max_episode_length:
                self._summary_max_episode_length = size

            states, actions, rewards, values, neglogprobs, TDs, GAEs = zip(
                *data)
            values = tf.convert_to_tensor(values).numpy()
            values = np.concatenate((values, [0]), axis=0)
            advantages = np.zeros(size)
            ad_t = 0
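            # GAE over one episode: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
            # A_t = delta_t + gamma * lambda * A_{t+1}, accumulated backwards; the appended
            # terminal value of 0 treats the last state of each episode as terminal.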

            for i in reversed(range(size)):
                delta = rewards[i] + values[i + 1] * self._gamma - values[i]
                ad_t = delta + self._gamma * self._lambd * ad_t
                advantages[i] = ad_t

            TD = values[:size] + advantages
            self._collectedStates[startIdx:startIdx + size] = list(states)
            self._collectedActions[startIdx:startIdx + size] = list(actions)
            self._collectedNeglogprobs[startIdx:startIdx +
                                       size] = list(neglogprobs)
            self._collectedTDs[startIdx:startIdx + size] = list(TD)
            self._collectedGAEs[startIdx:startIdx + size] = list(advantages)

            startIdx += size

        self._collectedStates = np.array(self._collectedStates,
                                         dtype=np.float32)
        self._collectedActions = tf.convert_to_tensor(
            self._collectedActions).numpy()
        self._collectedNeglogprobs = tf.convert_to_tensor(
            self._collectedNeglogprobs).numpy()
        self._collectedTDs = np.array(self._collectedTDs, dtype=np.float32)
        self._collectedGAEs = np.array(self._collectedGAEs, dtype=np.float32)

    def optimize(self):
        self.computeTDAndGAE()
        if len(self._collectedStates) < self._batchSize:
            return

        GAE = np.array(self._collectedGAEs)
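        # Normalize advantages to zero mean and unit variance for more stable policy-gradient steps.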
        GAE = (GAE - GAE.mean()) / (GAE.std() + 1e-5)

        ind = np.arange(len(GAE))

        np.random.shuffle(ind)

        for s in range(int(len(ind) // self._batchSize)):
            selectedIndex = ind[s * self._batchSize:(s + 1) * self._batchSize]

            selectedStates = tf.convert_to_tensor(
                self._collectedStates[selectedIndex])
            selectedActions = tf.convert_to_tensor(
                self._collectedActions[selectedIndex])
            selectedNeglogprobs = tf.convert_to_tensor(
                self._collectedNeglogprobs[selectedIndex])
            selectedTDs = tf.convert_to_tensor(
                self._collectedTDs[selectedIndex])
            selectedGAEs = tf.convert_to_tensor(GAE[selectedIndex])

            self.optimizeStep(selectedActions, selectedStates,
                              selectedNeglogprobs, selectedTDs, selectedGAEs)

    def optimizeStep(self, a, s, nl, td, gae):
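        # PPO clipped surrogate: ratio = pi_new(a|s) / pi_old(a|s), recovered from the stored
        # negative log-probabilities, and clipped to [1 - clip_range, 1 + clip_range] so a single
        # update cannot move the policy too far from the one that collected the data.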
        with tf.GradientTape() as tape:
            curNeglogprob = self._policy.neglogprob(a, s)
            ratio = tf.exp(nl - curNeglogprob)
            clippedRatio = tf.clip_by_value(ratio, 1.0 - self._clipRange,
                                            1.0 + self._clipRange)
            policyLoss = -tf.reduce_mean(
                tf.minimum(ratio * gae, clippedRatio * gae))

        gradients = tape.gradient(policyLoss,
                                  self._policy.trainable_variables())
        gradients, _grad_norm = tf.clip_by_global_norm(gradients, 0.5)
        self._policyOptimizer.apply_gradients(
            zip(gradients, self._policy.trainable_variables()))

        # optimize value function
        with tf.GradientTape() as tape:
            valueLoss = tf.reduce_mean(
                tf.square(self._valueFunction.getValue(s) - td))
        gradients = tape.gradient(
            valueLoss, self._valueFunction._value.trainable_variables)
        gradients, _grad_norm = tf.clip_by_global_norm(gradients, 0.5)
        self._valueFunctionOptimizer.apply_gradients(
            zip(gradients, self._valueFunction._value.trainable_variables))

    def reset(self):
        return

    def act(self, index, action):
        self._env.act("android" + str(index), action)

    def step(self, actions):
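        # One control step runs 20 simulator ticks with the same actions; each agent's
        # CustomSensor returns one flat vector laid out as [state | reward parts | eoe flags],
        # which is split into the three lists below.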
        for _ in range(20):
            for i in range(self._numSlaves):
                self.act(i, actions[i])
            res = self._env.tick()

        states = []
        rewards = []
        eoes = []
        for i in range(self._numSlaves):
            s = res["android" + str(i)]["CustomSensor"]
            states.append(s[:self._stateSize])
            rewards.append(s[self._stateSize:self._stateSize +
                             self._rewardSize])
            eoes.append(s[self._stateSize + self._rewardSize:])

        return states, rewards, eoes

    def runTraining(self, num_iteration=1):
        # create logging directory
        if not os.path.exists("output/"):
            os.mkdir("output/")
        self._directory = 'output/' + self._sessionName + '/'

        if not os.path.exists(self._directory):
            os.mkdir(self._directory)

        directory = self._directory + "rms/"
        if not os.path.exists(directory):
            os.mkdir(directory)

        directory = directory + "cur/"
        if not os.path.exists(directory):
            os.mkdir(directory)

        self.printParameters()

        while True:
            print("\nTraining start")
            self._summary_num_episodes_per_epoch = 0
            self._summary_num_transitions_per_epoch = 0
            self._summary_reward_per_epoch = 0
            self._summary_reward_by_part_per_epoch = []
            self._summary_max_episode_length = 0

            for it in range(num_iteration):
                self._summary_sim_time -= time.time()
                self._collectedEpisodes = []

                nan_count = 0

                # TODO : implement reset
                actions = [None] * self._numSlaves
                for i in range(self._numSlaves):
                    actions[i] = [1, random.random()]
                next_states, _, _ = self.step(actions)

                rewards = [None] * self._numSlaves
                episodes = [None] * self._numSlaves

                terminated = [False] * self._numSlaves
                resetRequired = [False] * self._numSlaves

                for j in range(self._numSlaves):
                    episodes[j] = Episode()

                self._summary_num_transitions_per_iteration = 0
                last_print = 0
                while True:
                    # get states
                    states = np.array(next_states)
                    states_for_update = states[~np.array(terminated)]
                    states_for_update = self._rms.apply(states_for_update)
                    states[~np.array(terminated)] = states_for_update

                    # set action
                    actions, logprobs = self._policy.getActionAndNeglogprob(
                        states)
                    values = self._valueFunction.getValue(states)

                    action_with_reset_signal = [None] * self._numSlaves
                    for j in range(self._numSlaves):
                        action_with_reset_signal[j] = [
                            0, 0
                        ] + actions[j].numpy().tolist()
                        if resetRequired[j]:
                            action_with_reset_signal[j][0] = 1
                            action_with_reset_signal[j][1] = random.random()

                    # run one step
                    next_states, r, e = self.step(action_with_reset_signal)

                    for j in range(self._numSlaves):
                        if terminated[j]:
                            continue

                        is_terminal = e[j][0] > 0.5
                        nan_occur = e[j][1] > 0.5
                        # push tuples only if nan did not occur
                        if not nan_occur:
                            if resetRequired[j]:
                                resetRequired[j] = False
                            else:
                                rewards[j] = r[j][0]
                                self._summary_reward_per_epoch += rewards[j]
                                self._summary_reward_by_part_per_epoch.append(
                                    r[j])
                                episodes[j].push(states[j], actions[j],
                                                 rewards[j], values[j],
                                                 logprobs[j])
                                self._summary_num_transitions_per_iteration += 1
                        else:
                            nan_count += 1

                        # if episode is terminated
                        if is_terminal:
                            # push episodes
                            if len(episodes[j].data) != 0:
                                self._collectedEpisodes.append(episodes[j])

                            if self._summary_num_transitions_per_iteration < self._transitionsPerIteration:
                                episodes[j] = Episode()
                                resetRequired[j] = True
                            else:
                                terminated[j] = True

                    # if local step exceeds t_p_i: wait for others to terminate
                    if self._summary_num_transitions_per_iteration >= self._transitionsPerIteration:
                        if all(terminated):
                            print('\r{}/{} : {}/{}'.format(
                                it + 1, num_iteration,
                                self._summary_num_transitions_per_iteration,
                                self._transitionsPerIteration),
                                  end='')
                            break

                    # print progress per 100 steps
                    if last_print + 100 < self._summary_num_transitions_per_iteration:
                        print('\r{}/{} : {}/{}'.format(
                            it + 1, num_iteration,
                            self._summary_num_transitions_per_iteration,
                            self._transitionsPerIteration),
                              end='')
                        last_print = self._summary_num_transitions_per_iteration

                self._summary_sim_time += time.time()
                self._summary_train_time -= time.time()

                # optimization
                print('')
                if (nan_count > 0):
                    print("nan_count : {}".format(nan_count))

                self._summary_num_episodes_per_epoch += len(
                    self._collectedEpisodes)
                self._summary_num_transitions_per_epoch += self._summary_num_transitions_per_iteration

                self.optimize()  ##SM) after getting all tuples, optimize once

                self._summary_train_time += time.time()

            # decay learning rate
            if self._learningRatePolicy > 1e-5:
                self._learningRatePolicy = self._learningRatePolicy * self._learningRatePolicyDecay

            print('Training end\n')

            self._summary_total_rewards.append(
                self._summary_reward_per_epoch /
                self._summary_num_episodes_per_epoch)
            self._summary_total_rewards_by_parts = np.insert(
                self._summary_total_rewards_by_parts,
                self._summary_total_rewards_by_parts.shape[1],
                np.asarray(self._summary_reward_by_part_per_epoch).sum(axis=0)
                / self._summary_num_episodes_per_epoch,
                axis=1)
            self._summary_mean_rewards.append(
                np.asarray(self._summary_total_rewards)[-10:].mean())
            self._summary_noise_records.append(
                self._policy.std().numpy().mean())

            self._summary_num_episodes_total += self._summary_num_episodes_per_epoch
            self._summary_num_transitions_total += self._summary_num_transitions_per_epoch
            t_per_e = 0
            if self._summary_num_episodes_per_epoch != 0:
                t_per_e = self._summary_num_transitions_per_epoch / self._summary_num_episodes_per_epoch
            self._summary_transition_per_episodes.append(t_per_e)

            # print summary
            self.printSummary()

    def play(self):
        # create logging directory
        if not os.path.exists("output/"):
            os.mkdir("output/")
        self._directory = 'output/' + self._sessionName + '/'

        if not os.path.exists(self._directory):
            os.mkdir(self._directory)

        directory = self._directory + "rms/"
        if not os.path.exists(directory):
            os.mkdir(directory)

        directory = directory + "cur/"
        if not os.path.exists(directory):
            os.mkdir(directory)
        self.printParameters()

        actions = [None] * self._numSlaves
        for i in range(self._numSlaves):
            actions[i] = [1, 0.0]
        next_states, _, _ = self.step(actions)

        rewards = [None] * self._numSlaves
        episodes = [None] * self._numSlaves

        terminated = [False] * self._numSlaves
        resetRequired = [False] * self._numSlaves

        last_print = 0
        while True:
            # get states
            states = np.array(next_states)
            states_for_update = states[~np.array(terminated)]
            states_for_update = self._rms.apply(states_for_update)
            states[~np.array(terminated)] = states_for_update

            # set action
            if self._isNetworkLoaded:
                # actions, _ = self._policy.getActionAndNeglogprob(states)
                actions = self._policy.getMeanAction(states)
            else:
                actions = np.zeros(shape=(self._numSlaves, self._actionSize))

            action_with_reset_signal = [None] * self._numSlaves
            for j in range(self._numSlaves):
                action_with_reset_signal[j] = [0, 0] + np.array(
                    actions[j]).tolist()
                if resetRequired[j]:
                    action_with_reset_signal[j][0] = 1
                    action_with_reset_signal[j][1] = random.random()

            # run one step
            next_states, r, e = self.step(action_with_reset_signal)

            for j in range(self._numSlaves):

                is_terminal = e[j][0] > 0.5
                nan_occur = e[j][1] > 0.5
                # clear the pending reset flag only if nan did not occur
                if not nan_occur:
                    if resetRequired[j]:
                        resetRequired[j] = False

                # if episode is terminated
                if is_terminal:
                    resetRequired[j] = True

        # optimization
        print('')

    def printParameters(self):
        # print on shell
        print(
            "===============================================================")
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        print("Elapsed time         : {:.2f}s".format(time.time() -
                                                      self._startTime))
        print("Session Name         : {}".format(self._sessionName))
        print("Slaves number        : {}".format(self._numSlaves))
        print("State size           : {}".format(self._stateSize))
        print("Action size          : {}".format(self._actionSize))
        print("Learning rate        : {:.6f}".format(self._learningRatePolicy))
        print("Gamma                : {}".format(self._gamma))
        print("Lambda               : {}".format(self._lambd))
        print("Batch size           : {}".format(self._batchSize))
        print("Transitions per iter : {}".format(
            self._transitionsPerIteration))
        print("PPO clip range       : {}".format(self._clipRange))
        print("Loaded netowrks      : {}".format(self._loadedNetwork))
        print(
            "===============================================================")

        # print to file
        out = open(self._directory + "parameters", "w")
        out.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S\n"))
        out.write("Session Name         : {}\n".format(self._sessionName))
        out.write("Slaves number        : {}\n".format(self._numSlaves))
        out.write("State size           : {}\n".format(self._stateSize))
        out.write("Action size          : {}\n".format(self._actionSize))
        out.write("Learning rate        : {:.6f}\n".format(
            self._learningRatePolicy))
        out.write("Gamma                : {}\n".format(self._gamma))
        out.write("Lambda               : {}\n".format(self._lambd))
        out.write("Batch size           : {}\n".format(self._batchSize))
        out.write("Transitions per iter : {}\n".format(
            self._transitionsPerIteration))
        out.write("PPO clip range       : {}\n".format(self._clipRange))
        out.write("Loaded netowrks      : {}\n".format(self._loadedNetwork))
        out.close()

        # pre make results file
        out = open(self._directory + "results", "w")
        out.close()

    def printSummary(self):
        np.save(self._directory + "rms/mean.npy".format(self._summary_num_log),
                self._rms.mean)
        np.save(self._directory + "rms/var.npy".format(self._summary_num_log),
                self._rms.var)

        print(
            '===============================================================')
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        print("Elapsed time         : {:.2f}s".format(time.time() -
                                                      self._startTime))
        print("Simulation time      : {}s".format(self._summary_sim_time))
        print("Training time        : {}s".format(self._summary_train_time))
        print("Session Name         : {}".format(self._sessionName))
        print("Logging Count        : {}".format(self._summary_num_log))
        print('Noise                : {:.3f}'.format(
            self._summary_noise_records[-1]))
        print('Learning rate        : {:.6f}'.format(self._learningRatePolicy))
        print('Total episode        : {}'.format(
            self._summary_num_episodes_total))
        print('Total trans          : {}'.format(
            self._summary_num_transitions_total))
        total_t_per_e = 0
        if self._summary_num_episodes_total != 0:
            total_t_per_e = self._summary_num_transitions_total / self._summary_num_episodes_total
        print('Total trans per epi  : {:.2f}'.format(total_t_per_e))
        print('Episode              : {}'.format(
            self._summary_num_episodes_per_epoch))
        print('Transition           : {}'.format(
            self._summary_num_transitions_per_epoch))
        print('Trans per epi        : {:.2f}'.format(
            self._summary_transition_per_episodes[-1]))
        print('Max episode length   : {}'.format(
            self._summary_max_episode_length))
        print('Rewards per episodes : {:.2f}'.format(
            self._summary_total_rewards[-1]))

        print(
            '===============================================================')

        # print plot
        y_list = [[np.asarray(self._summary_total_rewards_by_parts[0]), 'r'],
                  [np.asarray(self._summary_mean_rewards), 'r_mean'],
                  [np.asarray(self._summary_transition_per_episodes), 'steps'],
                  [np.asarray(self._summary_total_rewards_by_parts[1]), 'p'],
                  [np.asarray(self._summary_total_rewards_by_parts[2]), 'v'],
                  [np.asarray(self._summary_total_rewards_by_parts[3]), 'com'],
                  [np.asarray(self._summary_total_rewards_by_parts[4]), 'ee']]
        Plot(y_list, self._sessionName, 1, path=self._directory + "result.png")

        for i in range(len(y_list)):
            y_list[i][0] = np.array(y_list[i][0]) / np.array(
                self._summary_transition_per_episodes)
        y_list[1][0] = np.asarray(self._summary_noise_records)
        y_list[1][1] = 'noise'

        Plot(y_list,
             self._sessionName + "_per_step",
             2,
             path=self._directory + "result_per_step.png")

        # log to file
        out = open(self._directory + "results", "a")
        out.write(
            '===============================================================\n'
        )
        out.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S\n"))
        out.write("Elapsed time         : {:.2f}s\n".format(time.time() -
                                                            self._startTime))
        out.write("Simulation time      : {}s\n".format(
            self._summary_sim_time))
        out.write("Training time        : {}s\n".format(
            self._summary_train_time))
        out.write("Session Name         : {}\n".format(self._sessionName))
        out.write("Logging Count        : {}\n".format(self._summary_num_log))
        out.write('Noise                : {:.3f}\n'.format(
            self._summary_noise_records[-1]))
        out.write('Learning rate        : {:.6f}\n'.format(
            self._learningRatePolicy))
        out.write('Total episode        : {}\n'.format(
            self._summary_num_episodes_total))
        out.write('Total trans          : {}\n'.format(
            self._summary_num_transitions_total))
        out.write('Total trans per epi  : {:.2f}\n'.format(total_t_per_e))
        out.write('Episode              : {}\n'.format(
            self._summary_num_episodes_per_epoch))
        out.write('Transition           : {}\n'.format(
            self._summary_num_transitions_per_epoch))
        out.write('Trans per epi        : {:.2f}\n'.format(
            self._summary_transition_per_episodes[-1]))
        out.write('Max episode length   : {}\n'.format(
            self._summary_max_episode_length))
        out.write('Rewards per episodes : {:.2f}\n'.format(
            self._summary_total_rewards[-1]))

        out.write(
            '===============================================================\n'
        )
        out.close()

        # save network
        self.save(self._directory + "network")

        t_per_e = self._summary_transition_per_episodes[-1]
        tr = self._summary_total_rewards[-1]

        if t_per_e > self._smax:
            self._smax = t_per_e
            np.save(self._directory + "rms/mean_smax.npy", self._rms.mean)
            np.save(self._directory + "rms/var_smax.npy", self._rms.var)

            os.system(
                str(
                    Path(
                        "copy {}/network.data-00000-of-00001 {}/network-smax.data-00000-of-00001"
                        .format(self._directory, self._directory))))
            os.system(
                str(
                    Path(
                        "copy {}/network.data-00000-of-00002 {}/network-smax.data-00000-of-00002"
                        .format(self._directory, self._directory))))
            os.system(
                str(
                    Path(
                        "copy {}/network.data-00001-of-00002 {}/network-smax.data-00001-of-00002"
                        .format(self._directory, self._directory))))
            os.system(
                str(
                    Path("copy {}/network.index {}/network-smax.index".format(
                        self._directory, self._directory))))

        if tr > self._rmax:
            self._rmax = tr
            np.save(self._directory + "rms/mean_rmax.npy", self._rms.mean)
            np.save(self._directory + "rms/var_rmax.npy", self._rms.var)

            os.system(
                str(
                    Path(
                        "copy {}/network.data-00000-of-00001 {}/network-rmax.data-00000-of-00001"
                        .format(self._directory, self._directory))))
            os.system(
                str(
                    Path(
                        "copy {}/network.data-00000-of-00002 {}/network-rmax.data-00000-of-00002"
                        .format(self._directory, self._directory))))
            os.system(
                str(
                    Path(
                        "copy {}/network.data-00001-of-00002 {}/network-rmax.data-00001-of-00002"
                        .format(self._directory, self._directory))))
            os.system(
                str(
                    Path("copy {}/network.index {}/network-rmax.index".format(
                        self._directory, self._directory))))

        self._summary_num_log = self._summary_num_log + 1

        return

    def save(self, path):
        self._ckpt.write(path)

    def restore(self, path):
        self._ckpt.restore(path)
Example #6
class MultiLayerPolicy:
    def __init__(self,
                 name,
                 ob,
                 ac_shape,
                 hid_size=128,
                 num_hid_layers=3,
                 reuse=False):
        with tf.variable_scope(name, reuse=reuse):
            self.scope = tf.get_variable_scope().name
            self.build_net(ob, ac_shape, hid_size, num_hid_layers)

    def build_net(self, ob, ac_shape, hid_size, num_hid_layers):
        self.ob = ob
        self.ob_shape = ob.shape.as_list()[1:]

        with tf.variable_scope("ob_filter"):
            self.ob_rms = RunningMeanStd(ob.shape.as_list()[1:])

        # normalized observation
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)

        # net to fit value function
        net = obz
        for i in range(num_hid_layers):
            net = tf.layers.dense(
                inputs=net,
                units=hid_size,
                activation=tf.nn.tanh,
                kernel_initializer=tf.random_normal_initializer(mean=0,
                                                                stddev=1),
                name="vffc%i" % (i + 1))
        self.vpred = tf.layers.dense(
            inputs=net,
            units=1,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=1),
            name="vffinal")
        # train value function
        self.vreal = tf.placeholder(dtype=tf.float32,
                                    shape=(None, ),
                                    name="vreal")
        vloss = tf.reduce_mean(tf.square(self.vreal - self.vpred))
        valueFunctionVars = [
            v for v in self.get_trainable_variables()
            if v.name.startswith("%s/vff" % self.scope)
        ]
        self.vadam = tf.train.AdamOptimizer().minimize(
            vloss, var_list=valueFunctionVars)

        # net to predict mean and standard deviation of action
        net = obz
        for i in range(num_hid_layers):
            net = tf.layers.dense(
                inputs=net,
                units=hid_size,
                activation=tf.nn.tanh,
                kernel_initializer=tf.random_normal_initializer(mean=0,
                                                                stddev=1),
                name="polc%i" % (i + 1))
        mean = tf.layers.dense(inputs=net,
                               units=ac_shape[0],
                               activation=None,
                               kernel_initializer=tf.random_normal_initializer(
                                   mean=0, stddev=0.01))
        logstd = mean * 0.0 + tf.get_variable(
            name="logstd",
            shape=[1, ac_shape[0]],
            initializer=tf.zeros_initializer(),
            dtype=tf.float32)  # std not related to observation

        # action is normally distributed
        self.pd = DiagGaussianPd(mean, logstd)
        self.stochastic = tf.placeholder(dtype=tf.bool,
                                         shape=(),
                                         name="stochastic")
        self.action = tf.cond(self.stochastic, lambda: self.pd.sample(),
                              lambda: self.pd.mode())

    def act(self, stochastic, ob):
        action, vpred = tf.get_default_session().run(
            [self.action, self.vpred], {
                self.ob: ob[None],
                self.stochastic: stochastic
            })
        return action[0], vpred[0]

    def train_value_function(self, obs, vreals):
        self.ob_rms.update(obs)
        tf.get_default_session().run([self.vadam], {
            self.ob: obs,
            self.vreal: vreals
        })

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
Example #7
    def build_net(self, ob, ac_shape, hid_size, num_hid_layers):
        self.ob = ob
        self.ob_shape = ob.shape.as_list()[1:]

        with tf.variable_scope("ob_filter"):
            self.ob_rms = RunningMeanStd(ob.shape.as_list()[1:])

        # normalized observation
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)

        # net to fit value function
        net = obz
        for i in range(num_hid_layers):
            net = tf.layers.dense(
                inputs=net,
                units=hid_size,
                activation=tf.nn.tanh,
                kernel_initializer=tf.random_normal_initializer(mean=0,
                                                                stddev=1),
                name="vffc%i" % (i + 1))
        self.vpred = tf.layers.dense(
            inputs=net,
            units=1,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=1),
            name="vffinal")
        # train value function
        self.vreal = tf.placeholder(dtype=tf.float32,
                                    shape=(None, ),
                                    name="vreal")
        vloss = tf.reduce_mean(tf.square(self.vreal - self.vpred))
        valueFunctionVars = [
            v for v in self.get_trainable_variables()
            if v.name.startswith("%s/vff" % self.scope)
        ]
        self.vadam = tf.train.AdamOptimizer().minimize(
            vloss, var_list=valueFunctionVars)

        # net to predict mean and standard deviation of action
        net = obz
        for i in range(num_hid_layers):
            net = tf.layers.dense(
                inputs=net,
                units=hid_size,
                activation=tf.nn.tanh,
                kernel_initializer=tf.random_normal_initializer(mean=0,
                                                                stddev=1),
                name="polc%i" % (i + 1))
        mean = tf.layers.dense(inputs=net,
                               units=ac_shape[0],
                               activation=None,
                               kernel_initializer=tf.random_normal_initializer(
                                   mean=0, stddev=0.01))
        logstd = mean * 0.0 + tf.get_variable(
            name="logstd",
            shape=[1, ac_shape[0]],
            initializer=tf.zeros_initializer(),
            dtype=tf.float32)  # std not related to observation

        # action is normally distributed
        self.pd = DiagGaussianPd(mean, logstd)
        self.stochastic = tf.placeholder(dtype=tf.bool,
                                         shape=(),
                                         name="stochastic")
        self.action = tf.cond(self.stochastic, lambda: self.pd.sample(),
                              lambda: self.pd.mode())
Example #8
class PPO_RND(BaseAlgorithm):
    """
    PPO agent augmented with Random Network Distillation (RND), which provides an intrinsic exploration reward.
    :param env_id: (str)            name of environment to perform training on
    :param lr: (float)              learning rate
    :param int_lr: (float)          intrinsic learning rate
    :param nstep: (int)             storage rollout steps
    :param batch_size: (int)        batch size for training
    :param n_epochs: (int)          number of training epochs
    :param gamma: (float)           discount factor
    :param int_gamma: (float)       discount factor for intrinsic rewards
    :param gae_lam: (float)         lambda for generalized advantage estimation
    :param clip_range: (float)      clip range for surrogate loss
    :param ent_coef: (float)        entropy loss coefficient
    :param vf_coef: (float)         value loss coefficient
    :param int_vf_coef: (float)     intrinsic value loss coefficient
    :param max_grad_norm: (float)   max grad norm for optimizer
    :param hidden_size: (int)       size of the hidden layers of policy
    :param int_hidden_size: (int)   size of the hidden layers for the RND target and predictor networks
    :param rnd_start: (int)         number of initial steps used only to warm up observation normalization before intrinsic rewards are computed
    """
    def __init__(self,
                 *,
                 env_id,
                 lr=3e-4,
                 nstep=128,
                 batch_size=128,
                 n_epochs=10,
                 gamma=0.99,
                 int_gamma=0.99,
                 gae_lam=0.95,
                 clip_range=0.2,
                 ent_coef=.01,
                 vf_coef=0.5,
                 int_vf_coef=0.5,
                 max_grad_norm=0.2,
                 hidden_size=128,
                 int_hidden_size=128,
                 int_lr=3e-4,
                 rnd_start=1e+3):
        super(PPO_RND, self).__init__(env_id, lr, nstep, batch_size, n_epochs,
                                      gamma, gae_lam, clip_range, ent_coef,
                                      vf_coef, max_grad_norm)

        self.policy = Policy(self.env, hidden_size, intrinsic_model=True)
        self.rnd = RndNetwork(self.state_dim, hidden_size=int_hidden_size)
        self.rollout = IntrinsicStorage(nstep,
                                        self.num_envs,
                                        self.env.observation_space,
                                        self.env.action_space,
                                        gae_lam=gae_lam,
                                        gamma=gamma,
                                        int_gamma=int_gamma)
        self.optimizer = optim.Adam(self.policy.net.parameters(), lr=lr)
        self.rnd_optimizer = optim.Adam(self.rnd.parameters(), lr=int_lr)

        self.rnd_start = rnd_start
        self.int_vf_coef = int_vf_coef

        self.last_obs = self.env.reset()

        self.int_rew_rms = RunningMeanStd()

        self.normalize = True
        self.last_dones = np.array([0 for _ in range(self.num_envs)])

    def collect_samples(self):
        """
        Collect one full rollout, as determined by the nstep parameter, and add it to the buffer
        """
        assert self.last_obs is not None
        rollout_step = 0
        self.rollout.reset()

        while rollout_step < self.nstep:
            with torch.no_grad():
                actions, values, int_values, log_probs = self.policy.act(
                    self.last_obs)

            actions = actions.numpy()
            obs, rewards, dones, infos = self.env.step(actions)
            if any(dones):
                self.num_episodes += sum(dones)

            self.num_timesteps += self.num_envs
            self.update_info_buffer(infos)

            actions = actions.reshape(self.num_envs,
                                      self.action_converter.action_output)
            log_probs = log_probs.reshape(self.num_envs,
                                          self.action_converter.action_output)

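            # Warm-up: for the first `rnd_start` steps per environment, only the observation
            # statistics are updated and the intrinsic reward stays zero, so the RND networks
            # later operate on normalized observations.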
            if (self.num_timesteps / self.num_envs) < self.rnd_start:
                int_rewards = np.zeros_like(rewards)
                self.obs_rms.update(self.env.unnormalize_obs(self.last_obs))
            else:
                normalized_obs = self.normalize_obs(obs)
                int_rewards = self.rnd.int_reward(
                    normalized_obs).detach().numpy()
                self.int_rew_rms.update(int_rewards)

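                # Scale intrinsic rewards by their running standard deviation so their
                # magnitude stays stable as the predictor improves.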
                int_rewards /= (np.sqrt(self.int_rew_rms.var) + 1e-08)

            self.rollout.add(self.last_obs, actions, rewards, int_rewards,
                             values, int_values, dones, log_probs)
            self.last_obs = obs
            self.last_dones = dones
            rollout_step += 1

        self.rollout.compute_returns_and_advantages(values, int_values, dones)

        return True

    def train(self):
        """
        Use the collected data from the buffer to train the policy network
        """
        total_losses, policy_losses, value_losses, entropy_losses, intrinsic_losses = [], [], [], [], []
        rnd_trained = False
        for epoch in range(self.n_epochs):
            for batch in self.rollout.get(self.batch_size):
                observations = batch.observations
                actions = batch.actions
                old_log_probs = batch.old_log_probs
                old_values = batch.old_values
                old_int_values = batch.int_values
                advantages = batch.advantages
                int_advantages = batch.int_advantages
                returns = batch.returns
                int_returns = batch.int_returns

                # Get values and action probabilities using the updated policy on gathered observations
                state_values, int_values, action_log_probs, entropy = self.policy.evaluate(
                    observations, actions)

                # Normalize batch advantages
                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)
                int_advantages = (int_advantages - int_advantages.mean()) / (
                    int_advantages.std() + 1e-8)

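                # Combine extrinsic and intrinsic advantages (an unweighted sum here;
                # weighting the two streams with separate coefficients is another common choice).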
                advantages = advantages + int_advantages

                # Compute policy gradient ratio of current actions probs over previous
                ratio = torch.exp(action_log_probs - old_log_probs)

                # Compute surrogate loss
                surr_loss_1 = advantages * ratio
                surr_loss_2 = advantages * torch.clamp(
                    ratio, 1 - self.clip_range, 1 + self.clip_range)
                policy_loss = -torch.min(surr_loss_1, surr_loss_2).mean()

                # Clip state values for stability
                state_values_clipped = old_values + (
                    state_values - old_values).clamp(-self.clip_range,
                                                     self.clip_range)
                value_loss = F.mse_loss(returns, state_values).mean()
                value_loss_clipped = F.mse_loss(returns,
                                                state_values_clipped).mean()
                value_loss = torch.max(value_loss, value_loss_clipped).mean()

                # Clip state values for stability
                int_values_clipped = old_int_values + (
                    int_values - old_int_values).clamp(-self.clip_range,
                                                       self.clip_range)
                int_value_loss = F.mse_loss(int_returns, int_values).mean()
                int_value_loss_clipped = F.mse_loss(int_returns,
                                                    int_values_clipped).mean()
                int_value_loss = torch.max(int_value_loss,
                                           int_value_loss_clipped).mean()

                # Compute entropy loss
                entropy_loss = -torch.mean(entropy)

                # Total loss
                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss + self.int_vf_coef * int_value_loss

                # Perform optimization
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.net.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()

                # Train the RND predictor on roughly a quarter of the minibatches.
                if np.random.random() < 0.25:
                    self.train_rnd(batch)

                total_losses.append(loss.item())
                policy_losses.append(policy_loss.item())
                value_losses.append(value_loss.item())
                entropy_losses.append(entropy_loss.item())
                intrinsic_losses.append(int_value_loss.item())
            rnd_trained = True

        logger.record("train/intrinsic_loss", np.mean(intrinsic_losses))
        logger.record("train/entropy_loss", np.mean(entropy_losses))
        logger.record("train/policy_gradient_loss", np.mean(policy_losses))
        logger.record("train/value_loss", np.mean(value_losses))
        logger.record("train/total_loss", np.mean(total_losses))

        self._n_updates += self.n_epochs

    def train_rnd(self, batch):
        """
        Train the predictor RND network
        
        :param batch: (np.ndarray) batch from the current experience buffer
        """
        obs = batch.observations  #self.rew_norm_and_clip(batch.observations.numpy())
        obs = self.normalize_obs(obs.numpy())
        pred, target = self.rnd(torch.from_numpy(obs).float())

        loss = F.mse_loss(pred, target)

        self.rnd_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.rnd.parameters(),
                                       self.max_grad_norm)
        self.rnd_optimizer.step()

    def learn(self,
              total_timesteps,
              log_interval,
              reward_target=None,
              log_to_file=False):
        """
        Initiate the training of the algorithm.

        :param total_timesteps: (int)   total number of timesteps the agent is to run for
        :param log_interval: (int)      how often to perform logging
        :param reward_target: (int)     reaching the reward target stops training early
        :param log_to_file: (bool)      specify whether output ought to be logged
        """
        logger.configure("RND", self.env_id, log_to_file)
        start_time = time.time()
        iteration = 0

        while self.num_timesteps < total_timesteps:
            self.collect_samples()

            iteration += 1
            if log_interval is not None and iteration % log_interval == 0:
                logger.record("time/total timesteps", self.num_timesteps)
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        np.mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/num_episodes", self.num_episodes)
                fps = int(self.num_timesteps / (time.time() - start_time))
                logger.record("time/fps", fps)
                logger.record("time/total_time", (time.time() - start_time))
                logger.dump(step=self.num_timesteps)

            self.train()

            if reward_target is not None and np.mean(
                [ep_info["r"]
                 for ep_info in self.ep_info_buffer]) > reward_target:
                logger.record("time/total timesteps", self.num_timesteps)
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        np.mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/num_episodes", self.num_episodes)
                fps = int(self.num_timesteps / (time.time() - start_time))
                logger.record("time/fps", fps)
                logger.record("time/total_time", (time.time() - start_time))
                logger.dump(step=self.num_timesteps)
                break
        return self
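
# A minimal usage sketch, not part of the source: "MountainCar-v0" is only an illustrative
# env_id, and the surrounding imports and environment registration are assumed to be in place.
#
#     agent = PPO_RND(env_id="MountainCar-v0")
#     agent.learn(total_timesteps=1_000_000, log_interval=10)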