Пример #1
0
def main():
    """Replay a restored DDPG actor on the Gym pendulum helpers.

    Builds the actor model, restores its variables from a fixed checkpoint
    path, wraps it in ``ActDDPGOneStep`` with ``getNoise=None`` and then
    samples and visualizes ten trajectories.

    ``env``, ``seed`` and ``dirName`` are not assigned in this function;
    presumably they are module-level names -- verify against the full file.
    """
    stateDim = env.observation_space.shape[0]
    actionSpace = env.action_space
    actionDim = actionSpace.shape[0]
    actionHigh = actionSpace.high
    actionLow = actionSpace.low
    halfActionRange = (actionHigh - actionLow) / 2

    # The builder returns (writer, model); the writer is not used below.
    hiddenWidths = [30]
    modelBuilder = BuildActorModel(stateDim, actionDim, halfActionRange)
    _, actorModel = modelBuilder(hiddenWidths)

    checkpointName = 'Eps=200_actorModel=0_batch=128_env=Pendulum-v0_gam=0.9_lrActor=0.001_lrCritic=0.001_noiseVar=3_timeStep=200_varDiscout=0.9995.ckpt'
    checkpointPath = os.path.join(dirName, '..', 'trainedDDPGModels',
                                  checkpointName)
    restoreVariables(actorModel, checkpointPath)

    # No noise source is supplied for the replay.
    actOneStep = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain,
                                actorModel, getNoise=None)

    def policy(state):
        return actOneStep(observe(state))

    reset = ResetGymPendulum(seed)
    transit = TransitGymPendulum()
    rewardFunc = RewardGymPendulum(angle_normalize)

    maxRunningSteps = 200
    showDemo = True
    for _ in range(10):
        # Sampler and visualizer are rebuilt on every pass, as in the
        # original flow.
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit,
                                            isTerminalGymPendulum,
                                            rewardFunc, reset)
        trajectory = sampleTrajectory(policy)

        if not showDemo:
            continue
        visualize = VisualizeGymPendulum()
        visualize(trajectory)
Пример #2
0
def main():
    """Run DDPG on the Gym pendulum helpers, saving the actor as it trains.

    Wires the actor/critic builders, their trainers, the target-parameter
    updater, exploration noise, replay sampling and the pendulum
    transition/reward helpers into a ``RunAlgorithm`` loop with a
    ``SaveModel`` hook. Afterwards the returned trajectory is pickled,
    optionally visualized, and the mean-reward list is plotted.

    ``env``, ``seed``, ``gamma``, ``tau``, ``bufferSize``, ``minibatchSize``,
    ``maxTimeStep``, ``maxEpisode`` and both learning rates are not assigned
    here; presumably they are module-level settings -- verify against the
    full file.
    """
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    # Each builder returns a (writer, model) pair.
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    # The models handed to the trainer are the ones returned by
    # resetTargetParamToTrainParam, not the freshly built ones.
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    # The noise decay start step is set to the replay-buffer capacity.
    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    # The learning-start threshold equals the minibatch size.
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep,
                            isTerminalGymPendulum)

    # Resolved once; both the model and the trajectory paths are anchored at
    # the directory of this script.
    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum')
    # Looked up lazily so the hook sees the actor currently held by the trainer.
    getTrainedModel = lambda: trainModels.actorModel
    modelSaveRate = 50  # presumably "every 50 episodes" -- confirm in SaveModel
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                          modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'pendulumTrajectory1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # Demo and plot
    showDemo = True
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        # Derive the x-axis from the data so the plot cannot fail on a length
        # mismatch if fewer (or more) than maxEpisode rewards were returned.
        plt.plot(list(range(len(meanRewardList))), meanRewardList)
        plt.show()
    def __call__(self, df):
        """Run one DQN training condition and return its mean-reward curve.

        ``actionDim`` and ``epsilonIncrease`` are taken from the first entry
        of the same-named index levels of ``df``; every other setting is
        fixed inside this method. ``env`` and ``seed`` are not assigned here
        (presumably module-level names -- verify against the full file).

        Returns:
            A ``pd.Series`` built from a dict that maps each position
            (0, 1, ...) in the returned mean-reward list to its value.
        """
        levelValues = df.index.get_level_values
        actionDim = levelValues('actionDim')[0]
        epsilonIncrease = levelValues('epsilonIncrease')[0]

        # Fixed settings for this experiment.
        hiddenWidths = [30]
        learningRate = 0.001
        gamma = 0.99
        paramUpdateInterval = 300
        epsilonMax = 0.9
        epsilonMin = 0
        bufferSize = 10000
        minibatchSize = 128
        maxTimeStep = 200
        maxEpisode = 400

        stateDim = env.observation_space.shape[0]
        writer, model = BuildModel(stateDim, actionDim)(hiddenWidths)

        trainModelBySASRQ = TrainModelBySASRQ(learningRate, gamma, writer)
        updateParameters = UpdateParameters(paramUpdateInterval)
        # Continue with the model returned by the reset helper.
        model = resetTargetParamToTrainParam([model])[0]
        trainModels = TrainDQNModel(getTargetQValue, trainModelBySASRQ,
                                    updateParameters, model)

        # The epsilon schedule's start step equals the buffer size.
        getEpsilon = GetEpsilon(epsilonMax, epsilonMin, epsilonIncrease,
                                bufferSize)
        greedyAct = ActGreedyByModel(getTrainQValue, model)
        randomAct = ActRandom(actionDim)
        actByTrainNetEpsilonGreedy = ActByTrainNetEpsilonGreedy(
            getEpsilon, greedyAct, randomAct)

        # The learning-start threshold equals the minibatch size.
        sampleFromMemory = SampleFromMemory(minibatchSize)
        learnFromBuffer = LearnFromBuffer(minibatchSize, sampleFromMemory,
                                          trainModels)

        # The same action processor is shared by transition and reward.
        processAction = ProcessDiscretePendulumAction(actionDim)
        transit = TransitGymPendulum(processAction)
        getReward = RewardGymPendulum(angle_normalize, processAction)
        sampleOneStep = SampleOneStep(transit, getReward)

        runDQNTimeStep = RunTimeStep(actByTrainNetEpsilonGreedy,
                                     sampleOneStep, learnFromBuffer, observe)

        reset = ResetGymPendulum(seed)
        runEpisode = RunEpisode(reset, runDQNTimeStep, maxTimeStep,
                                isTerminalGymPendulum)

        dqn = RunAlgorithm(runEpisode, maxEpisode)
        replayBuffer = deque(maxlen=int(bufferSize))
        meanRewardList, trajectory = dqn(replayBuffer)

        rewardByStep = dict(enumerate(meanRewardList))
        return pd.Series(rewardByStep)
Пример #4
0
    def __call__(self, df):
        """Run one DDPG training condition and return its mean-reward curve.

        ``noiseInitVariance`` and ``memorySize`` are taken from the first
        entry of the same-named index levels of ``df``; all remaining
        settings come from ``self.fixedParameters``. When ``self.saveModel``
        is truthy, the trained actor and critic are written to paths obtained
        from ``self.getSavePath``. ``seed`` is not assigned here (presumably a
        module-level name -- verify against the full file).

        Returns:
            A ``pd.Series`` built from a dict that maps each position
            (0, 1, ...) in the returned mean-reward list to its value.
        """
        levelValues = df.index.get_level_values
        noiseVariance = levelValues('noiseInitVariance')[0]
        memorySize = levelValues('memorySize')[0]

        fixed = self.fixedParameters

        # Each builder returns a (writer, model) pair.
        actorBuilder = BuildActorModel(fixed['stateDim'], fixed['actionDim'],
                                       fixed['actionBound'])
        actorWriter, actorModel = actorBuilder(fixed['actorLayerWidths'])

        criticBuilder = BuildCriticModel(fixed['stateDim'],
                                         fixed['actionDim'])
        criticWriter, criticModel = criticBuilder(fixed['criticLayerWidths'])

        trainCriticBySASRQ = TrainCriticBySASRQ(fixed['learningRateCritic'],
                                                fixed['gamma'], criticWriter)
        trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                                  trainCriticBySASRQ)

        trainActorFromGradients = TrainActorFromGradients(
            fixed['learningRateActor'], actorWriter)
        trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                              trainActorFromGradients,
                                              getActionGradients)
        trainActor = TrainActor(trainActorOneStep)

        updateParameters = UpdateParameters(fixed['paramUpdateInterval'],
                                            fixed['tau'])

        # Continue with the models returned by the reset helper.
        actorModel, criticModel = resetTargetParamToTrainParam(
            [actorModel, criticModel])
        trainModels = TrainDDPGModels(updateParameters, trainActor,
                                      trainCritic, actorModel, criticModel)

        getNoise = GetExponentialDecayGaussNoise(
            noiseVariance, fixed['varianceDiscount'],
            fixed['noiseDecayStartStep'])
        actOneStepWithNoise = ActDDPGOneStep(fixed['actionLow'],
                                             fixed['actionHigh'],
                                             actByPolicyTrain, actorModel,
                                             getNoise)

        sampleFromMemory = SampleFromMemory(fixed['batchSize'])
        learnFromBuffer = LearnFromBuffer(fixed['learningStartStep'],
                                          sampleFromMemory, trainModels)

        transit = TransitGymPendulum()
        getReward = RewardGymPendulum(angle_normalize)
        sampleOneStep = SampleOneStep(transit, getReward)

        runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                      learnFromBuffer, observe)

        reset = ResetGymPendulum(seed)
        runEpisode = RunEpisode(reset, runDDPGTimeStep, fixed['maxRunSteps'],
                                isTerminalGymPendulum)

        ddpg = RunAlgorithm(runEpisode, fixed['maxEpisode'])

        # The replay buffer capacity is the swept ``memorySize`` value.
        replayBuffer = deque(maxlen=int(memorySize))
        meanRewardList, trajectory = ddpg(replayBuffer)

        trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

        resultSe = pd.Series(dict(enumerate(meanRewardList)))

        if self.saveModel:
            # Both paths are resolved before anything is written.
            actorPath = self.getSavePath({
                'ActorMemorySize': memorySize,
                'NoiseVariance': noiseVariance
            })
            criticPath = self.getSavePath({
                'CriticMemorySize': memorySize,
                'NoiseVariance': noiseVariance
            })
            toSave = ((trainedActorModel, actorPath),
                      (trainedCriticModel, criticPath))
            for trainedModel, savePath in toSave:
                with trainedModel.as_default():
                    saveVariables(trainedModel, savePath)

        return resultSe
Пример #5
0
def main():
    """Run DDPG on the Gym pendulum helpers, then save models and results.

    Wires the actor/critic builders, their trainers, the target-parameter
    updater, exploration noise, replay sampling and the pendulum
    transition/reward helpers into a ``RunAlgorithm`` loop. After the run the
    trained actor and critic are saved under names built by ``GetSavePath``,
    the trajectory is pickled, optionally visualized, and the mean-reward
    list is plotted.

    ``env``, ``seed``, ``ENV_NAME``, ``gamma``, ``tau``, ``bufferSize``,
    ``minibatchSize``, ``maxTimeStep``, ``maxEpisode`` and both learning
    rates are not assigned here; presumably they are module-level settings --
    verify against the full file.
    """
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    # Each builder returns a (writer, model) pair.
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    # The models handed to the trainer are the ones returned by
    # resetTargetParamToTrainParam, not the freshly built ones.
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    # The noise decay start step is set to the replay-buffer capacity.
    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    # The learning-start threshold equals the minibatch size.
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep,
                            isTerminalGymPendulum)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # Save models: the fixed parameter tags the file as actor or critic, the
    # shared parameters describe the run.
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'env': ENV_NAME,
        'Eps': maxEpisode,
        'timeStep': maxTimeStep,
        'batch': minibatchSize,
        'gam': gamma,
        'lrActor': learningRateActor,
        'lrCritic': learningRateCritic,
        'noiseVar': noiseInitVariance,
        'varDiscout': varianceDiscount
    }

    # Anchor output at this script's directory (as the trajectory path below
    # already is) so the save location does not depend on the working
    # directory the script is launched from.
    dirName = os.path.dirname(__file__)
    modelSaveDirectory = os.path.join(dirName, '..', 'trainedDDPGModels')
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    # Enter the graph of the model that is actually being saved.
    with trainedActorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with trainedCriticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'pendulumTrajectory1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # Demo and plot
    showDemo = True
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        # Derive the x-axis from the data so the plot cannot fail on a length
        # mismatch if fewer (or more) than maxEpisode rewards were returned.
        plt.plot(list(range(len(meanRewardList))), meanRewardList)
        plt.show()
def main():
    """Run DDPG with a single combined model on the Gym pendulum helpers.

    Builds one model via ``BuildDDPGModels`` (actor and critic initializers
    are passed together), wires critic/actor trainers, the parameter updater,
    exploration noise, replay sampling and the pendulum transition/reward
    helpers into a ``RunAlgorithm`` loop with a ``SaveModel`` hook, then
    optionally visualizes the trajectory and plots the mean-reward list.

    ``env``, ``seed``, ``gamma``, ``tau``, ``bufferSize``, ``minibatchSize``,
    ``maxTimeStep``, ``maxEpisode`` and both learning rates are not assigned
    here; presumably they are module-level settings -- verify against the
    full file.
    """
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow)/2

    actorWeightInit = tf.random_uniform_initializer(0, 0.03)
    actorBiasInit = tf.constant_initializer(0.01)
    criticWeightInit = tf.random_uniform_initializer(0, 0.01)
    criticBiasInit = tf.constant_initializer(0.01)

    # Order matters: actor weight, actor bias, critic weight, critic bias.
    weightInitializerList = [actorWeightInit, actorBiasInit, criticWeightInit, criticBiasInit]
    buildModel = BuildDDPGModels(stateDim, actionDim, weightInitializerList, actionBound)
    layerWidths = [30]
    writer, model = buildModel(layerWidths)

    # Both trainers share the same writer and batch-reshaping helper.
    trainCriticBySASR = TrainCriticBySASR(learningRateCritic, gamma, writer)
    trainCritic = TrainCritic(reshapeBatchToGetSASR, trainCriticBySASR)

    trainActorFromState = TrainActorFromState(learningRateActor, writer)
    trainActor = TrainActor(reshapeBatchToGetSASR, trainActorFromState)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, model)

    # The noise decay start step is set to the replay-buffer capacity.
    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, model, getNoise)

    # The learning-start threshold equals the minibatch size.
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum_newddpg')
    # Looked up lazily so the hook sees whatever the trainer currently holds.
    getTrainedModel = lambda: trainModels.getTrainedModels()
    modelSaveRate = 50  # presumably "every 50 episodes" -- confirm in SaveModel
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    # Demo and plot
    showDemo = False
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        # Derive the x-axis from the data so the plot cannot fail on a length
        # mismatch if fewer (or more) than maxEpisode rewards were returned.
        plt.plot(list(range(len(meanRewardList))), meanRewardList)
        plt.show()