def p_train(observPlaceHolderList,
            actionSpaceList,
            agentIndex,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping,
            ddpg,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy-training graph for one agent and return its callables.

    Inside ``tf.variable_scope(scope, reuse=reuse)`` this creates one action
    placeholder per entry of ``actionSpaceList``, an online policy network
    (scope "p_func") and a target policy network (scope "target_p_func"),
    both fed with ``observPlaceHolderList[agentIndex]``. The sampled action is
    the softmax of the policy output perturbed by ``-log(-log(u))`` noise with
    ``u`` uniform. The loss is minus the mean of the reused "q_func" output
    plus ``1e-3`` times the mean squared policy output.

    Args:
        observPlaceHolderList: list of observation tensors; the entry at
            ``agentIndex`` feeds this agent's policy.
        actionSpaceList: sequence of objects exposing an ``n`` attribute that
            gives the width of the matching action placeholder.
        agentIndex: index of the agent being trained.
        p_func: policy network builder, called as
            ``p_func(input, num_outputs, scope=..., num_units=...)``.
        q_func: critic network builder, called with ``reuse=True``.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        ddpg: when truthy the critic only sees this agent's observation and
            sampled action instead of all observations and actions.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the enclosing variable scope.
        reuse: reuse flag of the enclosing variable scope.

    Returns:
        Tuple ``(act, train, update_target_p, debug)`` where ``debug`` is a
        dict with the keys ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One float32 action placeholder per action space.
        actionPlaceHolderList = [
            tf.placeholder(dtype=tf.float32,
                           shape=[None, space.n],
                           name=f"action{idx}")
            for idx, space in enumerate(actionSpaceList)
        ]

        ownObservation = observPlaceHolderList[agentIndex]
        numActions = int(actionSpaceList[agentIndex].n)

        # Online policy network and the variables it owns.
        policyLogits = p_func(ownObservation,
                              numActions,
                              scope="p_func",
                              num_units=num_units)
        policyVars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Noisy softmax sample; this is what the `act` callable returns.
        uniformNoise = tf.random_uniform(tf.shape(policyLogits), seed=0)
        perturbedLogits = policyLogits - tf.log(-tf.log(uniformNoise))
        sampledAction = U.softmax(perturbedLogits, axis=-1)
        logitPenalty = tf.reduce_mean(tf.square(policyLogits))

        # Critic sees every agent's fed action, except that this agent's
        # slot is replaced by the freshly sampled action.
        criticActions = list(actionPlaceHolderList)
        criticActions[agentIndex] = sampledAction

        criticInput = tf.concat(observPlaceHolderList + criticActions, 1)
        if ddpg:
            criticInput = tf.concat([ownObservation, sampledAction], 1)

        qValues = q_func(criticInput,
                         1,
                         scope="q_func",
                         reuse=True,
                         num_units=num_units)[:, 0]
        policyGradientLoss = -tf.reduce_mean(qValues)
        totalLoss = policyGradientLoss + logitPenalty * 1e-3

        optimizeOp = U.minimize_and_clip(optimizer, totalLoss, policyVars,
                                         grad_norm_clipping)

        # Callables exposed to the caller.
        train = U.function(inputs=observPlaceHolderList +
                           actionPlaceHolderList,
                           outputs=totalLoss,
                           updates=[optimizeOp])
        act = U.function(inputs=[ownObservation], outputs=sampledAction)
        p_values = U.function([ownObservation], policyLogits)

        # Target policy network, fed with the same observation tensor.
        targetLogits = p_func(ownObservation,
                              numActions,
                              scope="target_p_func",
                              num_units=num_units)
        targetVars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policyVars, targetVars)

        targetNoise = tf.random_uniform(tf.shape(targetLogits))
        targetSample = U.softmax(targetLogits -
                                 tf.log(-tf.log(targetNoise)),
                                 axis=-1)
        target_act = U.function(inputs=[ownObservation],
                                outputs=targetSample)

        debugFns = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debugFns
示例#2
0
    def __call__(self, layersWidths, agentID=None):
        """Build a stand-alone actor graph (train + target nets) and open a session.

        A fresh ``tf.Graph`` is created. Placeholders, training constants,
        both networks, the soft/hard target-update ops, the noisy softmax
        outputs, the loss, the clipped Adam step, summaries and a saver are
        registered in graph collections under fixed string keys so callers
        can retrieve them by name.

        Args:
            layersWidths: sequence of hidden-layer widths shared by the train
                and target networks.
            agentID: optional identifier; when not None, ``'Agent<agentID>'``
                is appended to scope names and to the TensorBoard directory.

        Returns:
            Tuple ``(actorWriter, model)``: the ``tf.summary.FileWriter`` and
            the ``tf.Session`` bound to the new graph, with global variables
            already initialized.
        """
        agentStr = 'Agent' + str(agentID) if agentID is not None else ''
        print("Generating Actor NN Model with layers: {}".format(layersWidths))
        graph = tf.Graph()
        with graph.as_default():
            with tf.name_scope("inputs" + agentStr):
                states_ = tf.placeholder(tf.float32,
                                         [None, self.numStateSpace],
                                         name='states_')
                qVal_ = tf.placeholder(tf.float32, [None, 1], name='qVal_')

                tf.add_to_collection("states_", states_)
                tf.add_to_collection("qVal_", qVal_)

            with tf.name_scope("trainingParams" + agentStr):
                # Zero-valued constants; presumably overridden through
                # feed_dict by the caller -- TODO confirm.
                learningRate_ = tf.constant(0, dtype=tf.float32)
                tau_ = tf.constant(0, dtype=tf.float32)
                tf.add_to_collection("learningRate_", learningRate_)
                tf.add_to_collection("tau_", tau_)

            with tf.variable_scope("trainHidden" + agentStr):
                activation_ = states_
                for i, width in enumerate(layersWidths):
                    activation_ = layers.fully_connected(
                        activation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu,
                        scope="fc{}".format(i + 1))

                trainActivationOutput_ = layers.fully_connected(
                    activation_,
                    num_outputs=self.actionDim,
                    activation_fn=None,
                    scope="fc{}".format(len(layersWidths) + 1))

            with tf.variable_scope("targetHidden" + agentStr):
                activation_ = states_
                for i, width in enumerate(layersWidths):
                    activation_ = layers.fully_connected(
                        activation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu,
                        scope="fc{}".format(i + 1))

                targetActivationOutput_ = layers.fully_connected(
                    activation_,
                    num_outputs=self.actionDim,
                    activation_fn=None,
                    scope="fc{}".format(len(layersWidths) + 1))

            with tf.name_scope("updateParameters" + agentStr):
                trainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope='trainHidden')
                targetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope='targetHidden')

                # Soft update: target <- (1 - tau) * target + tau * train.
                updateParam_ = [
                    targetParam.assign((1 - tau_) * targetParam +
                                       tau_ * trainParam)
                    for trainParam, targetParam in zip(trainParams_,
                                                       targetParams_)
                ]

                tf.add_to_collection("trainParams_", trainParams_)
                tf.add_to_collection("targetParams_", targetParams_)
                tf.add_to_collection("updateParam_", updateParam_)

                # Hard update: copy the train weights INTO the target net.
                # tf.assign(ref, value) writes `value` into `ref`, so the
                # target variable must be the first argument; the previous
                # argument order overwrote the train net instead.
                hardReplaceTargetParam_ = [
                    tf.assign(targetParam, trainParam)
                    for trainParam, targetParam in zip(trainParams_,
                                                       targetParams_)
                ]
                tf.add_to_collection("hardReplaceTargetParam_",
                                     hardReplaceTargetParam_)

            with tf.name_scope("output" + agentStr):
                trainAction_ = tf.multiply(trainActivationOutput_,
                                           self.actionRange,
                                           name='trainAction_')
                targetAction_ = tf.multiply(targetActivationOutput_,
                                            self.actionRange,
                                            name='targetAction_')

                # Softmax of the raw output perturbed by -log(-log(u)) noise.
                sampleNoiseTrain_ = tf.random_uniform(
                    tf.shape(trainActivationOutput_))
                noisyTrainAction_ = U.softmax(
                    trainActivationOutput_ -
                    tf.log(-tf.log(sampleNoiseTrain_)),
                    axis=-1)  # give this to q input

                tf.add_to_collection("sampleNoiseTrain_", sampleNoiseTrain_)

                sampleNoiseTarget_ = tf.random_uniform(
                    tf.shape(targetActivationOutput_))
                noisyTargetAction_ = U.softmax(
                    targetActivationOutput_ -
                    tf.log(-tf.log(sampleNoiseTarget_)),
                    axis=-1)

                tf.add_to_collection("trainAction_", trainAction_)
                tf.add_to_collection("targetAction_", targetAction_)

                tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
                tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)

            with tf.name_scope("train" + agentStr):
                p_reg = tf.reduce_mean(tf.square(trainActivationOutput_))
                # NOTE(review): qVal_ is a placeholder, so pg_loss carries no
                # gradient with respect to trainParams_ in this graph; only
                # the p_reg term does. Confirm this is intended.
                pg_loss = -tf.reduce_mean(qVal_)
                actorLoss_ = pg_loss + p_reg * 1e-3

                tf.summary.scalar("pg_loss", pg_loss)
                tf.add_to_collection("actorLoss_", actorLoss_)

                optimizer = tf.train.AdamOptimizer(learningRate_,
                                                   name='adamOptimizer')
                grad_norm_clipping = 0.5

                gradients = optimizer.compute_gradients(actorLoss_,
                                                        var_list=trainParams_)
                # Clip each available gradient by its own norm.
                for i, (grad, var) in enumerate(gradients):
                    if grad is not None:
                        gradients[i] = (tf.clip_by_norm(
                            grad, grad_norm_clipping), var)

                with tf.name_scope("inspectGrad"):
                    for grad_, var_ in gradients:
                        keyPrefix = "weightGradient" if "weights" in var_.name else "biasGradient"
                        tf.add_to_collection(f"{keyPrefix}/{var_.name}", grad_)
                    # Missing gradients cannot be reshaped, so leave them out
                    # of the flattened inspection tensor.
                    gradients_ = [
                        tf.reshape(grad_, [1, -1])
                        for grad_, var_ in gradients
                        if grad_ is not None
                    ]
                    allGradTensor_ = tf.concat(gradients_, 1)
                    allGradNorm_ = tf.norm(allGradTensor_)
                    tf.add_to_collection("allGradNorm", allGradNorm_)
                    tf.summary.histogram("allGradients", allGradTensor_)
                    tf.summary.scalar("allGradNorm", allGradNorm_)

                trainOpt_ = optimizer.apply_gradients(gradients)

                tf.add_to_collection("trainOpt_", trainOpt_)

            with tf.name_scope("summary" + agentStr):
                actorLossSummary_ = tf.identity(actorLoss_)
                tf.add_to_collection("actorLossSummary_", actorLossSummary_)
                tf.summary.scalar("actorLossSummary", actorLossSummary_)

            fullSummary = tf.summary.merge_all()
            tf.add_to_collection("summaryOps", fullSummary)

            actorSaver = tf.train.Saver(max_to_keep=None)
            tf.add_to_collection("saver", actorSaver)

            model = tf.Session(graph=graph)
            model.run(tf.global_variables_initializer())

            actorWriter = tf.summary.FileWriter('tensorBoard/actorOnlineDDPG' +
                                                agentStr,
                                                graph=graph)
            tf.add_to_collection("actorWriter", actorWriter)

        return actorWriter, model
    def __call__(self, layersWidths, agentID=None):
        """Build a single-graph actor-critic DDPG model and open a session.

        A fresh ``tf.Graph`` is created holding train/target actor networks,
        train/target critic networks (the train critic is applied twice with
        shared weights: once to the fed action, once to the actor's noisy
        action), soft/hard target-update ops, both losses and their clipped
        Adam steps, a summary, a saver and a file writer. Everything callers
        need is registered in graph collections under fixed string keys.

        Args:
            layersWidths: sequence of hidden-layer widths used for every
                actor and critic network.
            agentID: optional identifier; when not None, ``'Agent<agentID>'``
                is appended to scope names and to the TensorBoard directory.

        Returns:
            The ``tf.Session`` bound to the new graph, with global variables
            already initialized.
        """

        def noisySoftmaxPerAgent(jointAction_):
            # Apply the noisy softmax separately to each controlled agent's
            # slice of the joint action and concatenate the results again.
            # Assumes actionDim == numAgentsToControl * singleActionDim, as
            # the reshape requires.
            batchSize = tf.shape(jointAction_)[0]
            # The batch axis must stay first: reshaping to
            # [numAgents, batch, dim] would mix values of different batch
            # rows because tf.reshape keeps row-major element order.
            perAgentAction_ = tf.reshape(
                jointAction_,
                [batchSize, self.numAgentsToControl, self.singleActionDim])
            agentSamples = []
            for i in range(self.numAgentsToControl):
                agentAction_ = perAgentAction_[:, i]
                sampleNoise_ = tf.random_uniform(tf.shape(agentAction_))
                agentSamples.append(
                    U.softmax(agentAction_ - tf.log(-tf.log(sampleNoise_)),
                              axis=-1))
            return tf.concat(agentSamples, axis=1)

        agentStr = 'Agent' + str(agentID) if agentID is not None else ''
        graph = tf.Graph()
        with graph.as_default():
            with tf.variable_scope("inputs/" + agentStr):
                states_ = tf.placeholder(tf.float32,
                                         [None, self.numStateSpace],
                                         name='states_')
                nextStates_ = tf.placeholder(tf.float32,
                                             [None, self.numStateSpace],
                                             name='nextStates_')
                action_ = tf.stop_gradient(tf.placeholder(
                    tf.float32, [None, self.actionDim]),
                                           name='action_')
                reward_ = tf.placeholder(tf.float32, [None, 1], name='reward_')

                tf.add_to_collection("states_", states_)
                tf.add_to_collection("nextStates_", nextStates_)
                tf.add_to_collection("action_", action_)
                tf.add_to_collection("reward_", reward_)

            with tf.variable_scope("trainingParams" + agentStr):
                # Zero-valued constants; presumably overridden through
                # feed_dict by the caller -- TODO confirm.
                learningRate_ = tf.constant(0, dtype=tf.float32)
                tau_ = tf.constant(0, dtype=tf.float32)
                gamma_ = tf.constant(0, dtype=tf.float32)

                tf.add_to_collection("learningRate_", learningRate_)
                tf.add_to_collection("tau_", tau_)
                tf.add_to_collection("gamma_", gamma_)

            with tf.variable_scope("actor/trainHidden/" + agentStr):
                actorTrainActivation_ = states_
                for width in layersWidths:
                    actorTrainActivation_ = layers.fully_connected(
                        actorTrainActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                actorTrainActivation_ = layers.fully_connected(
                    actorTrainActivation_,
                    num_outputs=self.actionDim,
                    activation_fn=None)

            with tf.variable_scope("actor/targetHidden/" + agentStr):
                actorTargetActivation_ = nextStates_
                for width in layersWidths:
                    actorTargetActivation_ = layers.fully_connected(
                        actorTargetActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                actorTargetActivation_ = layers.fully_connected(
                    actorTargetActivation_,
                    num_outputs=self.actionDim,
                    activation_fn=None)

            with tf.variable_scope("actorNetOutput/" + agentStr):
                trainAction_ = tf.multiply(actorTrainActivation_,
                                           self.actionRange,
                                           name='trainAction_')
                targetAction_ = tf.multiply(actorTargetActivation_,
                                            self.actionRange,
                                            name='targetAction_')

                # These noisy actions are what the critics receive.
                noisyTrainAction_ = noisySoftmaxPerAgent(trainAction_)
                noisyTargetAction_ = noisySoftmaxPerAgent(targetAction_)

                tf.add_to_collection("trainAction_", trainAction_)
                tf.add_to_collection("targetAction_", targetAction_)

                tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
                tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)

            # Train critic evaluated on the action fed through action_.
            with tf.variable_scope("critic/trainHidden/" + agentStr):
                criticTrainActivationOfGivenAction_ = tf.concat(
                    [states_, action_], axis=1)
                for width in layersWidths:
                    criticTrainActivationOfGivenAction_ = layers.fully_connected(
                        criticTrainActivationOfGivenAction_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                criticTrainActivationOfGivenAction_ = layers.fully_connected(
                    criticTrainActivationOfGivenAction_,
                    num_outputs=1,
                    activation_fn=None)

            # Same critic weights (reuse=True) evaluated on the actor's
            # noisy action; this output drives the actor loss.
            with tf.variable_scope("critic/trainHidden/" + agentStr,
                                   reuse=True):
                criticTrainActivation_ = tf.concat(
                    [states_, noisyTrainAction_], axis=1)
                for width in layersWidths:
                    criticTrainActivation_ = layers.fully_connected(
                        criticTrainActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                criticTrainActivation_ = layers.fully_connected(
                    criticTrainActivation_, num_outputs=1, activation_fn=None)

            with tf.variable_scope("critic/targetHidden/" + agentStr):
                criticTargetActivation_ = tf.concat(
                    [nextStates_, noisyTargetAction_], axis=1)
                for width in layersWidths:
                    criticTargetActivation_ = layers.fully_connected(
                        criticTargetActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                criticTargetActivation_ = layers.fully_connected(
                    criticTargetActivation_, num_outputs=1, activation_fn=None)

            with tf.variable_scope("updateParameters/" + agentStr):
                actorTrainParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='actor/trainHidden/' + agentStr)
                actorTargetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='actor/targetHidden/' + agentStr)
                # Soft update: target <- (1 - tau) * target + tau * train.
                actorUpdateParam_ = [
                    targetParam.assign((1 - tau_) * targetParam +
                                       tau_ * trainParam)
                    for trainParam, targetParam in zip(actorTrainParams_,
                                                       actorTargetParams_)
                ]

                tf.add_to_collection("actorTrainParams_", actorTrainParams_)
                tf.add_to_collection("actorTargetParams_", actorTargetParams_)
                tf.add_to_collection("actorUpdateParam_", actorUpdateParam_)

                # Hard update: copy the train weights INTO the target net.
                # tf.assign(ref, value) writes `value` into `ref`, so the
                # target variable must be the first argument; the previous
                # argument order overwrote the train net instead.
                hardReplaceActorTargetParam_ = [
                    tf.assign(targetParam, trainParam)
                    for trainParam, targetParam in zip(actorTrainParams_,
                                                       actorTargetParams_)
                ]
                tf.add_to_collection("hardReplaceActorTargetParam_",
                                     hardReplaceActorTargetParam_)

                criticTrainParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='critic/trainHidden/' + agentStr)
                criticTargetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='critic/targetHidden/' + agentStr)

                criticUpdateParam_ = [
                    targetParam.assign((1 - tau_) * targetParam +
                                       tau_ * trainParam)
                    for trainParam, targetParam in zip(criticTrainParams_,
                                                       criticTargetParams_)
                ]

                tf.add_to_collection("criticTrainParams_", criticTrainParams_)
                tf.add_to_collection("criticTargetParams_",
                                     criticTargetParams_)
                tf.add_to_collection("criticUpdateParam_", criticUpdateParam_)

                hardReplaceCriticTargetParam_ = [
                    tf.assign(targetParam, trainParam)
                    for trainParam, targetParam in zip(criticTrainParams_,
                                                       criticTargetParams_)
                ]
                tf.add_to_collection("hardReplaceCriticTargetParam_",
                                     hardReplaceCriticTargetParam_)

                updateParam_ = actorUpdateParam_ + criticUpdateParam_
                hardReplaceTargetParam_ = hardReplaceActorTargetParam_ + hardReplaceCriticTargetParam_
                tf.add_to_collection("updateParam_", updateParam_)
                tf.add_to_collection("hardReplaceTargetParam_",
                                     hardReplaceTargetParam_)

            with tf.variable_scope("trainActorNet/" + agentStr):
                trainQ = criticTrainActivation_[:, 0]
                pg_loss = -tf.reduce_mean(trainQ)
                p_reg = tf.reduce_mean(tf.square(actorTrainActivation_))
                actorLoss_ = pg_loss + p_reg * 1e-3

                actorOptimizer = tf.train.AdamOptimizer(learningRate_,
                                                        name='actorOptimizer')
                grad_norm_clipping = 0.5
                # Only the actor variables are updated by this step.
                actorTrainOpt_ = U.minimize_and_clip(actorOptimizer,
                                                     actorLoss_,
                                                     actorTrainParams_,
                                                     grad_norm_clipping)

                tf.add_to_collection("actorLoss_", actorLoss_)
                tf.add_to_collection("actorTrainOpt_", actorTrainOpt_)

            with tf.variable_scope("trainCriticNet/" + agentStr):
                # Bootstrapped target: reward + gamma * target-critic value.
                yi_ = reward_ + gamma_ * criticTargetActivation_
                criticLoss_ = tf.reduce_mean(
                    tf.squared_difference(
                        tf.squeeze(yi_),
                        tf.squeeze(criticTrainActivationOfGivenAction_)))

                tf.add_to_collection("yi_", yi_)
                tf.add_to_collection("valueLoss_", criticLoss_)

                criticOptimizer = tf.train.AdamOptimizer(
                    learningRate_, name='criticOptimizer')
                grad_norm_clipping = 0.5
                # Only the train-critic variables are updated by this step.
                crticTrainOpt_ = U.minimize_and_clip(criticOptimizer,
                                                     criticLoss_,
                                                     criticTrainParams_,
                                                     grad_norm_clipping)

                tf.add_to_collection("crticTrainOpt_", crticTrainOpt_)

            with tf.variable_scope("summary" + agentStr):
                criticLossSummary = tf.identity(criticLoss_)
                tf.add_to_collection("criticLossSummary", criticLossSummary)
                tf.summary.scalar("criticLossSummary", criticLossSummary)

            fullSummary = tf.summary.merge_all()
            tf.add_to_collection("summaryOps", fullSummary)

            saver = tf.train.Saver(max_to_keep=None)
            tf.add_to_collection("saver", saver)

            model = tf.Session(graph=graph)
            model.run(tf.global_variables_initializer())

            writer = tf.summary.FileWriter('tensorBoard/onlineDDPG/' +
                                           agentStr,
                                           graph=graph)
            tf.add_to_collection("writer", writer)

        return model
示例#4
0
    def __call__(self, layersWidths, agentID):
        """Build one agent's MADDPG graph (local actor, centralized critic) and open a session.

        A fresh ``tf.Graph`` is created. The actor networks read only the
        state placeholder at index ``agentID``; the critic networks read the
        concatenation of every agent's state and action placeholders. Target
        update ops, both losses with their clipped Adam steps, a summary, a
        saver and a file writer are created, and the tensors/ops callers need
        are registered in graph collections under fixed string keys.

        Args:
            layersWidths: sequence of hidden-layer widths used for every
                actor and critic network.
            agentID: index of this agent in the per-agent placeholder lists;
                also used to build the ``'Agent<agentID>'`` scope suffix.

        Returns:
            The ``tf.Session`` bound to the new graph, with global variables
            already initialized.
        """
        agentStr = 'Agent' + str(agentID)
        graph = tf.Graph()
        with graph.as_default():
            with tf.variable_scope("inputs/" + agentStr):
                allAgentsStates_ = [
                    tf.placeholder(dtype=tf.float32,
                                   shape=[None, agentObsDim],
                                   name="state" + str(i))
                    for i, agentObsDim in enumerate(self.obsShapeList)
                ]
                allAgentsNextStates_ = [
                    tf.placeholder(dtype=tf.float32,
                                   shape=[None, agentObsDim],
                                   name="nextState" + str(i))
                    for i, agentObsDim in enumerate(self.obsShapeList)
                ]

                allAgentsActions_ = [
                    tf.placeholder(dtype=tf.float32,
                                   shape=[None, self.actionDim],
                                   name="action" + str(i))
                    for i in range(self.numAgents)
                ]
                allAgentsNextActionsByTargetNet_ = [
                    tf.placeholder(dtype=tf.float32,
                                   shape=[None, self.actionDim],
                                   name="actionTarget" + str(i))
                    for i in range(self.numAgents)
                ]

                agentReward_ = tf.placeholder(tf.float32, [None, 1],
                                              name='reward_')

                tf.add_to_collection("allAgentsStates_", allAgentsStates_)
                tf.add_to_collection("allAgentsNextStates_",
                                     allAgentsNextStates_)
                tf.add_to_collection("allAgentsActions_", allAgentsActions_)
                tf.add_to_collection("allAgentsNextActionsByTargetNet_",
                                     allAgentsNextActionsByTargetNet_)
                tf.add_to_collection("agentReward_", agentReward_)

            with tf.variable_scope("trainingParams" + agentStr):
                # Zero-valued constants; presumably overridden through
                # feed_dict by the caller -- TODO confirm.
                learningRate_ = tf.constant(0, dtype=tf.float32)
                tau_ = tf.constant(0, dtype=tf.float32)
                gamma_ = tf.constant(0, dtype=tf.float32)

                tf.add_to_collection("learningRate_", learningRate_)
                tf.add_to_collection("tau_", tau_)
                tf.add_to_collection("gamma_", gamma_)

            # Actor acts on this agent's own observation only.
            with tf.variable_scope("actor/trainHidden/" + agentStr):
                currentAgentState_ = allAgentsStates_[agentID]
                actorTrainActivation_ = currentAgentState_

                for width in layersWidths:
                    actorTrainActivation_ = layers.fully_connected(
                        actorTrainActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                actorTrainActivation_ = layers.fully_connected(
                    actorTrainActivation_,
                    num_outputs=self.actionDim,
                    activation_fn=None)

            with tf.variable_scope("actor/targetHidden/" + agentStr):
                currentAgentNextState_ = allAgentsNextStates_[agentID]
                actorTargetActivation_ = currentAgentNextState_

                for width in layersWidths:
                    actorTargetActivation_ = layers.fully_connected(
                        actorTargetActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                actorTargetActivation_ = layers.fully_connected(
                    actorTargetActivation_,
                    num_outputs=self.actionDim,
                    activation_fn=None)

            with tf.variable_scope("actorNetOutput/" + agentStr):
                trainAction_ = tf.multiply(actorTrainActivation_,
                                           self.actionRange,
                                           name='trainAction_')
                targetAction_ = tf.multiply(actorTargetActivation_,
                                            self.actionRange,
                                            name='targetAction_')

                # Softmax of the scaled output perturbed by -log(-log(u)) noise.
                sampleNoiseTrain_ = tf.random_uniform(tf.shape(trainAction_))
                noisyTrainAction_ = U.softmax(
                    trainAction_ - tf.log(-tf.log(sampleNoiseTrain_)),
                    axis=-1)  # give this to q input

                sampleNoiseTarget_ = tf.random_uniform(tf.shape(targetAction_))
                noisyTargetAction_ = U.softmax(
                    targetAction_ - tf.log(-tf.log(sampleNoiseTarget_)),
                    axis=-1)

                tf.add_to_collection("trainAction_", trainAction_)
                tf.add_to_collection("targetAction_", targetAction_)

                tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
                tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)

            # Train critic evaluated on the actions fed for every agent.
            with tf.variable_scope("critic/trainHidden/" + agentStr):
                criticTrainActivationOfGivenAction_ = tf.concat(
                    allAgentsStates_ + allAgentsActions_, axis=1)

                for width in layersWidths:
                    criticTrainActivationOfGivenAction_ = layers.fully_connected(
                        criticTrainActivationOfGivenAction_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                criticTrainActivationOfGivenAction_ = layers.fully_connected(
                    criticTrainActivationOfGivenAction_,
                    num_outputs=1,
                    activation_fn=None)

            # Same critic weights (reuse=True), but this agent's action slot
            # is replaced by its actor's noisy action; drives the actor loss.
            with tf.variable_scope("critic/trainHidden/" + agentStr,
                                   reuse=True):
                criticInputActionList = list(allAgentsActions_)
                criticInputActionList[agentID] = noisyTrainAction_
                criticTrainActivation_ = tf.concat(
                    allAgentsStates_ + criticInputActionList, axis=1)

                for width in layersWidths:
                    criticTrainActivation_ = layers.fully_connected(
                        criticTrainActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                criticTrainActivation_ = layers.fully_connected(
                    criticTrainActivation_, num_outputs=1, activation_fn=None)

            with tf.variable_scope("critic/targetHidden/" + agentStr):
                criticTargetActivation_ = tf.concat(
                    allAgentsNextStates_ + allAgentsNextActionsByTargetNet_,
                    axis=1)
                for width in layersWidths:
                    criticTargetActivation_ = layers.fully_connected(
                        criticTargetActivation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu)

                criticTargetActivation_ = layers.fully_connected(
                    criticTargetActivation_, num_outputs=1, activation_fn=None)

            with tf.variable_scope("updateParameters/" + agentStr):
                actorTrainParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='actor/trainHidden/' + agentStr)
                actorTargetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='actor/targetHidden/' + agentStr)
                # Soft update: target <- (1 - tau) * target + tau * train.
                actorUpdateParam_ = [
                    targetParam.assign((1 - tau_) * targetParam +
                                       tau_ * trainParam)
                    for trainParam, targetParam in zip(actorTrainParams_,
                                                       actorTargetParams_)
                ]

                tf.add_to_collection("actorTrainParams_", actorTrainParams_)
                tf.add_to_collection("actorTargetParams_", actorTargetParams_)
                tf.add_to_collection("actorUpdateParam_", actorUpdateParam_)

                # Hard update: copy the train weights INTO the target net.
                # tf.assign(ref, value) writes `value` into `ref`, so the
                # target variable must be the first argument; the previous
                # argument order overwrote the train net instead.
                hardReplaceActorTargetParam_ = [
                    tf.assign(targetParam, trainParam)
                    for trainParam, targetParam in zip(actorTrainParams_,
                                                       actorTargetParams_)
                ]
                tf.add_to_collection("hardReplaceActorTargetParam_",
                                     hardReplaceActorTargetParam_)

                criticTrainParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='critic/trainHidden/' + agentStr)
                criticTargetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='critic/targetHidden/' + agentStr)

                criticUpdateParam_ = [
                    targetParam.assign((1 - tau_) * targetParam +
                                       tau_ * trainParam)
                    for trainParam, targetParam in zip(criticTrainParams_,
                                                       criticTargetParams_)
                ]

                tf.add_to_collection("criticTrainParams_", criticTrainParams_)
                tf.add_to_collection("criticTargetParams_",
                                     criticTargetParams_)
                tf.add_to_collection("criticUpdateParam_", criticUpdateParam_)

                hardReplaceCriticTargetParam_ = [
                    tf.assign(targetParam, trainParam)
                    for trainParam, targetParam in zip(criticTrainParams_,
                                                       criticTargetParams_)
                ]
                tf.add_to_collection("hardReplaceCriticTargetParam_",
                                     hardReplaceCriticTargetParam_)

                updateParam_ = actorUpdateParam_ + criticUpdateParam_
                hardReplaceTargetParam_ = hardReplaceActorTargetParam_ + hardReplaceCriticTargetParam_
                tf.add_to_collection("updateParam_", updateParam_)
                tf.add_to_collection("hardReplaceTargetParam_",
                                     hardReplaceTargetParam_)

            with tf.variable_scope("trainActorNet/" + agentStr):
                trainQ = criticTrainActivation_[:, 0]
                pg_loss = -tf.reduce_mean(trainQ)
                p_reg = tf.reduce_mean(tf.square(actorTrainActivation_))
                actorLoss_ = pg_loss + p_reg * 1e-3

                actorOptimizer = tf.train.AdamOptimizer(learningRate_,
                                                        name='actorOptimizer')
                # Only the actor variables are updated by this step.
                actorTrainOpt_ = U.minimize_and_clip(actorOptimizer,
                                                     actorLoss_,
                                                     actorTrainParams_,
                                                     self.gradNormClipping)

                tf.add_to_collection("actorLoss_", actorLoss_)
                tf.add_to_collection("actorTrainOpt_", actorTrainOpt_)

            with tf.variable_scope("trainCriticNet/" + agentStr):
                # Bootstrapped target: reward + gamma * target-critic value.
                yi_ = agentReward_ + gamma_ * criticTargetActivation_
                criticLoss_ = tf.reduce_mean(
                    tf.squared_difference(
                        tf.squeeze(yi_),
                        tf.squeeze(criticTrainActivationOfGivenAction_)))

                tf.add_to_collection("yi_", yi_)
                tf.add_to_collection("valueLoss_", criticLoss_)

                criticOptimizer = tf.train.AdamOptimizer(
                    learningRate_, name='criticOptimizer')
                # Only the train-critic variables are updated by this step.
                crticTrainOpt_ = U.minimize_and_clip(criticOptimizer,
                                                     criticLoss_,
                                                     criticTrainParams_,
                                                     self.gradNormClipping)

                tf.add_to_collection("crticTrainOpt_", crticTrainOpt_)

            with tf.variable_scope("summary" + agentStr):
                criticLossSummary = tf.identity(criticLoss_)
                tf.add_to_collection("criticLossSummary", criticLossSummary)
                tf.summary.scalar("criticLossSummary", criticLossSummary)

            fullSummary = tf.summary.merge_all()
            tf.add_to_collection("summaryOps", fullSummary)

            saver = tf.train.Saver(max_to_keep=None)
            tf.add_to_collection("saver", saver)

            model = tf.Session(graph=graph)
            model.run(tf.global_variables_initializer())

            writer = tf.summary.FileWriter('tensorBoard/onlineDDPG/' +
                                           agentStr,
                                           graph=graph)
            tf.add_to_collection("writer", writer)

        return model
 def sample(self):
     """Return a noisy softmax sample of ``self.logits``.

     Uniform noise ``u`` with the shape of the logits is drawn, the logits
     are shifted by ``-log(-log(u))`` and the softmax over the last axis of
     the shifted logits is returned.
     """
     uniformNoise = tf.random_uniform(tf.shape(self.logits))
     perturbedLogits = self.logits - tf.log(-tf.log(uniformNoise))
     return U.softmax(perturbedLogits, axis=-1)
示例#6
0
 def mode(self):
     """Return the softmax of ``self.logits`` over the last axis, without noise."""
     logits = self.logits
     return U.softmax(logits, axis=-1)