Example #1
class Policy(nn.Module):
    """Actor (Policy) Model."""
    def __init__(self, policy_params):
        """ arch_parameters is a dictionary like:
        {'state_and_action_dims' : (num1, num2), layers : {'Linear_1' : layer_size_1,..,'Linear_n' : layer_size_n} }
        """
        super(Policy, self).__init__()
        self.policy_params = policy_params
        self.seed_as_int = policy_params['seed']
        torch.manual_seed(self.seed_as_int)
        self.arch_params = policy_params['arch_params']
        self.__state_dim = self.arch_params['state_and_action_dims'][0]
        self.__action_dim = self.arch_params['state_and_action_dims'][1]
        self.eps = policy_params['eps']
        self.min_eps = policy_params['min_eps']
        self.eps_decay = policy_params['eps_decay']
        self.__noise_type = policy_params['noise_type']

        keys = list(self.arch_params['layers'].keys())
        list_of_layers = []

        prev_layer_size = self.__state_dim
        for i in range(len(self.arch_params['layers'])):
            key = keys[i]
            layer_type = key.split('_')[0]
            if layer_type == 'Linear':
                layer_size = self.arch_params['layers'][key]
                list_of_layers.append(nn.Linear(prev_layer_size, layer_size))
                prev_layer_size = layer_size
            elif layer_type == 'LayerNorm':
                list_of_layers.append(nn.LayerNorm(prev_layer_size))
            elif layer_type == 'ReLU':
                list_of_layers.append(nn.ReLU())
            elif layer_type == 'Tanh':
                list_of_layers.append(nn.Tanh())
            else:
                raise ValueError("Got unsupported layer type: '{}'. Check your layers!".format(layer_type))

        self.layers = nn.ModuleList(list_of_layers)

        #noise
        if self.__noise_type == 'action':
            self.__rand_process = OUNoise((self.__action_dim,))

        elif self.__noise_type == 'parameter':
            self.network_params_perturbations = dict()
            for i, layer in enumerate(self.layers):
                # Only Linear layers carry weights to perturb; skip LayerNorm and activation layers.
                if isinstance(layer, nn.Linear):
                    self.network_params_perturbations[i] = OUNoise(tuple(layer.weight.shape))
        else:
            raise ValueError("Got an unspecified type of noise. The only available options are 'parameter' and 'action'.")

    def forward(self, state):
        """Map a state (or batch of states) to (action, perturbed_action)."""

        if self.__noise_type == 'action':
            y = state.float()
            for i in range(len(self.layers)):
                y = self.layers[i](y).float()
            y_perturbed = y + self.eps*torch.from_numpy(self.__rand_process.noise()).float()
            return y, torch.clamp(y_perturbed, min = -1.0, max = 1.0)

        elif self.__noise_type == 'parameter':
            y = state.float()
            y_perturbed = state.float()
            for i in range(len(self.layers)):
                if not isinstance(self.layers[i], nn.Linear):
                    # LayerNorm / activation layers are applied unchanged to both streams.
                    y = self.layers[i](y).float()
                    y_perturbed = self.layers[i](y_perturbed).float()
                else:
                    # weights: clean weights for y, perturbed weights for y_perturbed
                    if (self.layers[i].weight).shape[1] == y.shape[0]: #if there is a single state
                        y = (self.layers[i].weight).matmul(y)
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = ((self.layers[i].weight) + self.eps*n).matmul(y_perturbed)
                    else:                                              #if there is a batch of states
                        y = y.matmul((self.layers[i].weight).t())
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = y_perturbed.matmul(((self.layers[i].weight) + self.eps*n).t())
                    #biases
                    y = y + (self.layers[i].bias)
                    y_perturbed = y_perturbed + (self.layers[i].bias)
            return y, y_perturbed
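
Example #1 (and the examples below) calls an OUNoise helper whose noise() method returns a NumPy array shaped like the constructor argument (an int or a tuple of dims). The project-specific implementation is not shown on this page; the following is a minimal sketch under that assumption, with hypothetical defaults mu=0.0, theta=0.15, sigma=0.2.

import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck process exposing the noise() call used in these examples."""

    def __init__(self, shape, mu=0.0, theta=0.15, sigma=0.2):
        self.shape = shape
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(shape) * mu

    def reset(self):
        self.state = np.ones(self.shape) * self.mu

    def noise(self):
        # Mean-reverting random walk: dx = theta * (mu - x) + sigma * N(0, 1).
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(np.shape(self.state))
        self.state = self.state + dx
        return self.state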
Example #2
File: NAF_ip.py Project: posvyatokum/RL
    def run_n_episodes(self,
                       num_episodes,
                       max_ep_length,
                       minibatch_size,
                       explore=True,
                       num_updates=5,
                       summary_checkpoint=1,
                       eta=0.01,
                       num_updates_ac=1,
                       T=1):  #num_updates from article
        for i in range(num_episodes):
            noise = OUNoise(self.a_dim)
            x = self.env.reset()
            x = x.reshape(1, -1)
            u = np.zeros(self.a_dim)  # placeholder action; overwritten inside the loop
            t = False
            episodes_reward = 0

            #for REINFORCE
            self.r_rs.append([])
            self.r_xs.append([])
            self.r_us.append([])
            self.episodes_ls.append(0)
            while True:
                self.episodes_ls[-1] = self.episodes_ls[-1] + 1
                if self.det:
                    u, V = self.sess.run((self.model.mu_det, self.model.V),
                                         feed_dict={self.model.inputs_x: x})
                    self.episodes_Vs.append(V)
                else:
                    u, P, sigma, V = self.sess.run(
                        (self.model.mu_norm, self.model.P, self.model.sigma,
                         self.model.V),
                        feed_dict={self.model.inputs_x: x})
                    self.episodes_Ps.append(P)
                    self.episodes_ss.append(sigma)
                    self.episodes_Vs.append(V)
                    if self.separate_V:
                        self.episodes_V_s.append(self.critic.predict_V_sep(x))

                if explore:
                    u += noise.noise()
                    u = np.clip(u, -1.0, 1.0)

                u = u.reshape(1, -1)
                x1, r, t, info = self.env.step(u.reshape(-1))
                self.r_xs[-1].append(x)
                self.r_us[-1].append(u)
                self.r_rs[-1].append(r)
                episodes_reward += r
                self.buffer.add(x.reshape(1, -1), u, r, t, x1.reshape(1, -1))
                self.episodes_xs.append(x)
                self.episodes_us.append(u)
                self.episodes_rs.append(r)
                #Actor-Critic
                x = x1.reshape(1, -1)

                if self.qNAF:
                    for k in range(num_updates):
                        x_batch, u_batch, r_batch, t_batch, x1_batch = \
                            self.buffer.sample_batch(minibatch_size)
                        x_batch, u_batch, r_batch, t_batch, x1_batch = \
                            x_batch.reshape(-1, self.s_dim), u_batch.reshape(-1, self.a_dim), r_batch.reshape(-1, 1), \
                            t_batch.reshape(-1), x1_batch.reshape(-1, self.s_dim)

                        # Fitted Q update: bootstrap the target from the target network's V estimate.
                        y_batch = self.gamma * self.target_model.predict_V(x1_batch) + r_batch
                        self.model.update_Q(x_batch, u_batch, y_batch)

                        self.target_model.soft_update_from(self.model)
                if t:
                    break
            if self.ac:
                r_xs_l = np.array(self.r_xs[-1]).reshape(-1, self.s_dim)
                r_rs_l = np.array(self.r_rs[-1]).reshape(-1, 1)
                for idx in range(2, len(r_rs_l) + 1):
                    r_rs_l[-idx] += self.gamma * r_rs_l[-idx + 1]
                self.r_rs[-1] = r_rs_l
                r_rs_ = np.array(self.r_rs).reshape(-1, 1)
                r_xs_ = np.array(self.r_xs).reshape(-1, self.s_dim)
                r_us_ = np.array(self.r_us).reshape(-1, self.a_dim)
                for _ in range(num_updates_ac):
                    #update V every episode
                    if self.separate_V:
                        self.critic.update_V_sep(r_xs_l, r_rs_l)
                    if i % T == 0:
                        #Q_target = r_rs_[:-1] + self.gamma * self.critic.predict_V_sep(r_xs_[1:])
                        #Q_target = np.vstack((Q_target, (r_rs_[-1])))
                        deltas = r_rs_
                        if self.separate_V:
                            deltas = deltas - self.critic.predict_V_sep(r_xs_)
                        else:
                            deltas = deltas - self.target_model.predict_V(
                                r_xs_)
                        '''
                        loss = self.sess.run((self.model.loss_spg),
                                                              feed_dict={self.model.inputs_x: r_xs_,
                                                                         self.model.inputs_u: r_us_,
                                                                         self.model.inputs_Q: deltas})
                        print('loss before update', loss)                     
                        '''
                        self.model.update_mu(r_xs_, r_us_, deltas)
                        self.target_model.soft_update_from(self.model)
                        '''
                        loss = self.sess.run((self.model.loss_spg),
                                                              feed_dict={self.model.inputs_x: r_xs_,
                                                                         self.model.inputs_u: r_us_,
                                                                         self.model.inputs_Q: deltas})
                        print('loss after update', loss)                     
                        '''
                        #self.target_model.soft_update_from(self.model)
                        self.r_rs = []
                        self.r_xs = []
                        self.r_us = []

            if summary_checkpoint > 0 and i % summary_checkpoint == 0:
                print('| Reward: %.2i' % int(episodes_reward), " | Episode", i)
                self.plot_rewards(self.summary_dir)
            self.rewards.append(episodes_reward)
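
The backwards loop in Example #2 (r_rs_l[-idx] += self.gamma * r_rs_l[-idx + 1]) turns per-step rewards into discounted reward-to-go returns in place. Below is a standalone sketch of the same computation, assuming a plain list of rewards (the function name is illustrative, not from the project).

import numpy as np

def discounted_returns(rewards, gamma):
    """Accumulate reward-to-go backwards, mirroring the in-place loop in Example #2."""
    returns = np.asarray(rewards, dtype=np.float64).reshape(-1, 1).copy()
    for idx in range(2, len(returns) + 1):
        returns[-idx] += gamma * returns[-idx + 1]
    return returns

# discounted_returns([1.0, 1.0, 1.0], 0.9) -> [[2.71], [1.9], [1.0]]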
Example #3
class RDPGAgent:
    def __init__(self, env, batchSize = 10, bufferSize = 100,
                 gamma = 0.98, actorLR = 1e-4, criticLR = 1e-3,
                 maxSteps = 200, targetUpdate = 1e-3, epsilon = 1,
                 decay = 0.99, rewardScale = 1e-3, logFile = 'run.log'):
        self.env = env
        self.gamma = gamma
        self.batchSize = batchSize
        self.bufferSize = bufferSize
        self.maxSteps = maxSteps + 1
        self.rewardScale = rewardScale
        self.epsilon = epsilon
        self.decay = decay

        # Useful helpers.
        self.actionDim = self.env.action_space.shape[0]
        self.stateDim = self.env.observation_space.shape[0]
        self.featureDim = self.actionDim + self.stateDim
        self.minAction = self.env.action_space.low
        self.maxAction = self.env.action_space.high

        # For scaling output action values.
        self.actionBiasZeroOne = self.minAction
        self.actionScaleZeroOne = self.maxAction - self.minAction
        self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
        self.actionScaleTanH = self.maxAction - self.actionBiasTanH 

        # Initialize noise process.
        self.noise = OUNoise(self.actionDim)

        # Initialize replay buffer.
        self.buffer = ReplayBuffer(self.bufferSize)

        # Initialize logging.
        logging.basicConfig(filename = logFile,
                            level = logging.INFO,
                            format = '[%(asctime)s] %(message)s',
                            datefmt = '%m/%d/%Y %I:%M:%S %p')
        logging.info('Initializing RDPG agent with passed settings.')

        # Tensorflow GPU optimization.
        config = tf.ConfigProto() # GPU fix?
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config = config)
        from keras import backend as K
        K.set_session(self.sess)

        # Make actor network (creates target model internally).
        self.actor = Actor(self.sess, self.maxSteps, self.featureDim,
                           self.actionDim, self.batchSize, targetUpdate,
                           actorLR, self.actionScaleTanH, self.actionBiasTanH)

        # Make critic network (creates target model internally).
        self.critic = Critic(self.sess, self.maxSteps, self.featureDim,
                             self.actionDim, self.batchSize, targetUpdate,
                             criticLR)

    # Train or run for some number of episodes.
    def run(self, numEpisodes, training = False, warmUp = 30):
        for i in range(numEpisodes):
            sequence = []
            totalReward = 0
            totalSteps = 0
            o = self.env.reset()

            # Stores (O1, A1, O2, A2, etc) for prediction.
            history = np.zeros((self.maxSteps * self.featureDim))
            history[:self.stateDim] = o
            for j in range(self.maxSteps - 1):
                # We do this reshaping to get history into (BatchSize, TimeSteps, Dims).
                batchedHistory = np.reshape(history, (self.maxSteps, self.featureDim))
                batchedHistory = np.expand_dims(batchedHistory, axis = 0)

                # Predict action or use random with e-greedy.
                # if (np.random.random_sample() < self.epsilon and training):
                #     a = np.random.random((self.actionDim))
                #     a = a * self.actionScaleZeroOne
                #     a = a + self.actionBiasZeroOne
                # else:
                #     a = self.actor.model.predict(batchedHistory)[0]

                # Predict an action and add noise to it for exploration purposes.
                a = self.actor.model.predict(batchedHistory)[0] + self.epsilon * self.noise.noise()
                a = np.clip(a, self.minAction, self.maxAction)

                # Take a single step.
                oPrime, r, d, _ = self.env.step(a)
                r *= self.rewardScale

                newTimeStart = (j + 1) * self.featureDim
                # Update agent state and ongoing agent history data. History is
                # passed to our actor for prediction, and sequence is for later.
                history[j * self.featureDim + self.stateDim:newTimeStart] = a
                history[newTimeStart:(j + 1) * self.featureDim + self.stateDim] = oPrime
                sequence.append({'o': o, 'a': a, 'r': r, 'd': d})
                totalReward += r
                totalSteps += 1
                o = oPrime

                # Quit early.
                if d: break

            # Anneal epsilon.
            if i > warmUp:
                self.epsilon *= self.decay

            # Print some episode debugging and reward information.
            print('Episode: %03d / Steps: %d / Reward: %f' % (i + 1, totalSteps, totalReward / self.rewardScale))
            logging.info('Episode: %03d / Steps: %d / Reward: %f' % (i + 1, totalSteps, totalReward / self.rewardScale))

            # Simulation only.
            if not training:
                continue

            # Add sequence to buffer.
            self.buffer.add(sequence)

            # Resample sequences from the buffer
            samples = self.buffer.getBatch(self.batchSize)
            numSamples = len(samples)

            # Do not train until the buffer
            # holds at least warmUp episodes.
            if self.buffer.getCount() < warmUp:
                continue

            # Some more debug info.
            print('Training on sampled sequences from all episodes.')
            logging.info('Training on sampled sequences from all episodes.')

            # All of these do not include time step t = T.
            # Used to store H[i, t] for each episode i and step t.
            sampleHistories = np.zeros((numSamples, self.maxSteps - 1,
                                        self.maxSteps * self.featureDim))
            # Used to store H[i, t + 1] for each episode i and step t.
            sampleHistoriesWithNext = np.zeros((numSamples, self.maxSteps - 1,
                                                self.maxSteps * self.featureDim))
            # Used to store R[i, t] for each episode i and step t.
            sampleRewards = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store NotDone[i, t] for each episode i and step t.
            sampleNotDoneMasks = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store action[i, t] taken for each episode i and step t.
            sampleActions = np.zeros((numSamples, self.maxSteps - 1, self.actionDim))

            # Compute info for each episode i.
            for i in range(numSamples):
                sample = samples[i]
                historySoFar = np.zeros((self.maxSteps * self.featureDim))
                # Iteratively build up historySoFar for each timestep t.
                for t in range(len(sample) - 1):
                    step, nextStep = sample[t], sample[t + 1]
                    # This is (ot, at), which we append to the running history.
                    stepFeatures = np.concatenate([step['o'], step['a']], axis = 0)
                    historySoFar[t * self.featureDim:(t + 1) * self.featureDim] = stepFeatures

                    # This is (o1, a1, o2, a2 ... ot).
                    sampleHistoryEnd = (t + 1) * self.featureDim - self.actionDim
                    sampleHistories[i, t, :sampleHistoryEnd] = historySoFar[:sampleHistoryEnd]

                    # This is (o1, a1, o2, a2 ... ot, at, ot+1).
                    sampleNextEnd = (t + 1) * self.featureDim
                    sampleHistoriesWithNext[i, t, :sampleNextEnd] = historySoFar[:sampleNextEnd]
                    sampleHistoriesWithNext[i, t, sampleNextEnd:sampleNextEnd + self.stateDim] = nextStep['o']

                    # Set rewards and not done masks.
                    sampleRewards[i, t] = step['r']
                    sampleActions[i, t] = step['a']
                    sampleNotDoneMasks[i, t] = 0 if step['d'] else 1

            # Separate out self.maxSteps since it is the timestep dimension for RNN.
            sampleHistories = np.reshape(sampleHistories, (numSamples, self.maxSteps - 1,
                                                           self.maxSteps, self.featureDim))

            # Separate out self.maxSteps since it is the timestep dimension for RNN.
            sampleHistoriesWithNext = np.reshape(sampleHistoriesWithNext, (numSamples, self.maxSteps - 1,
                                                                           self.maxSteps, self.featureDim))

            # Update models using samples, rewards, and masks.
            self.update(numSamples, sampleHistories, sampleHistoriesWithNext,
                        sampleRewards, sampleActions, sampleNotDoneMasks)

    # Given a bunch of experienced histories, update our models.
    def update(self, numSamples, histories, historiesNext, rewards, chosenActions, notDoneMasks):
        # Reshape [i, t] pairs to a single dimension, which will be the RNN batch dimension.
        historiesBatch = np.reshape(histories, (-1, self.maxSteps, self.featureDim))
        historiesNextBatch = np.reshape(historiesNext, (-1, self.maxSteps, self.featureDim))

        # Compute QSample targets [y] for updating the critic Q[S][A] outputs.
        targetActions = self.actor.target.predict(historiesNextBatch) # (B * (T - 1), F).
        targetQ = self.critic.target.predict([historiesNextBatch, targetActions]) # (B * (T - 1), 1).
        targetQ = np.reshape(targetQ, (numSamples, self.maxSteps - 1)) # (B, T - 1).
        y = rewards + notDoneMasks * (self.gamma * targetQ) # (B, T - 1)
        y = np.reshape(y, (numSamples * (self.maxSteps - 1), 1)) # (B * (T - 1), 1)

        # Train the critic model, passing in both the history and chosen actions.
        chosenActionsFlat = np.reshape(chosenActions, (numSamples * (self.maxSteps - 1), self.actionDim))
        # print (chosenActionsFlat.shape, historiesBatch.shape, historiesNextBatch.shape)
        self.critic.model.train_on_batch([historiesBatch, chosenActionsFlat], y)

        # Compute the gradient of the critic output WRT to its action input.
        # We cannot use chosenActions here since those were noisy predictions.
        currentActionsForGrad = self.actor.model.predict(historiesBatch)
        currentActionsGrad = self.critic.modelActionGradients(historiesBatch, currentActionsForGrad)

        # Train the actor model using the critic gradient WRT action input.
        self.actor.trainModel(historiesBatch, currentActionsGrad)

        # Update target models.
        self.actor.trainTarget()
        self.critic.trainTarget()
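
Example #3 also depends on a ReplayBuffer with add, getBatch, and getCount methods that stores whole episode sequences; that class is not shown in the snippet. A minimal sketch compatible with those calls, assuming uniform sampling without replacement:

import random
from collections import deque

class ReplayBuffer:
    """Minimal episode-sequence buffer matching the add/getBatch/getCount calls in Example #3."""

    def __init__(self, bufferSize):
        self.sequences = deque(maxlen=bufferSize)

    def add(self, sequence):
        # A sequence is a list of {'o', 'a', 'r', 'd'} step dictionaries.
        self.sequences.append(sequence)

    def getBatch(self, batchSize):
        # Return up to batchSize stored sequences, sampled uniformly without replacement.
        return random.sample(list(self.sequences), min(batchSize, len(self.sequences)))

    def getCount(self):
        return len(self.sequences)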