Example #1
    def __init__(self, config, localNet, env, globalNets, globalOptimizer, netLossFunc, nbAction, rank,
                 globalEpisodeCount, globalEpisodeReward, globalRunningAvgReward, resultQueue, logFolder,
                 stateProcessor=None, lock=None):

        self.globalPolicyNet = globalNets[0]
        self.globalTargetNet = globalNets[1]
        self.rank = rank
        self.globalOptimizer = globalOptimizer
        self.localNet = localNet

        mp.Process.__init__(self)
        DQNAgent.__init__(self, config, localNet, None, None, netLossFunc, nbAction, stateProcessor)


        self.totalStep = 0
        self.updateGlobalFrequency = 10
        if 'updateGlobalFrequency' in self.config:
            self.updateGlobalFrequency = self.config['updateGlobalFrequency']


        self.globalEpisodeCount = globalEpisodeCount
        self.globalEpisodeReward = globalEpisodeReward
        self.globalRunningAvgReward = globalRunningAvgReward
        self.resultQueue = resultQueue
        self.dirName = logFolder

        self.randomSeed = 1 + self.rank
        if 'randomSeed' in self.config:
            self.randomSeed = self.config['randomSeed'] + self.rank
        torch.manual_seed(self.randomSeed)

        self.nStepForward = 1
        if 'nStepForward' in self.config:
            self.nStepForward = self.config['nStepForward']
        self.targetNetUpdateEpisode = 10
        if 'targetNetUpdateEpisode' in self.config:
            self.targetNetUpdateEpisode = self.config['targetNetUpdateEpisode']

        self.nStepBuffer = []

        # only use vanilla replay memory
        self.memory = ReplayMemory(self.memoryCapacity)

        self.priorityMemoryOption = False

        # whether to use a synchronization lock
        self.synchLock = False
        if 'synchLock' in self.config:
            self.synchLock = self.config['synchLock']

        self.lock = lock

        self.device = 'cpu'
        if 'device' in self.config and torch.cuda.is_available():
            self.device = self.config['device']
            torch.cuda.manual_seed(self.randomSeed)
            self.localNet = self.localNet.cuda()
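
The shared objects handed to this worker (globalEpisodeCount, globalEpisodeReward, resultQueue) are typically torch.multiprocessing primitives shared across processes. A minimal, self-contained sketch of that sharing pattern, with illustrative names that are not taken from the source:

import torch.multiprocessing as mp

def worker(rank, globalEpisodeCount, resultQueue):
    # Increment the shared episode counter under its lock and report back.
    with globalEpisodeCount.get_lock():
        globalEpisodeCount.value += 1
        episode = globalEpisodeCount.value
    resultQueue.put((rank, episode))

if __name__ == '__main__':
    globalEpisodeCount = mp.Value('i', 0)  # shared integer counter
    resultQueue = mp.Queue()               # queue for results from workers
    workers = [mp.Process(target=worker, args=(r, globalEpisodeCount, resultQueue))
               for r in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    while not resultQueue.empty():
        print(resultQueue.get())
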
Example #2

env = DynamicMazeMultiMap(config)

N_S = env.stateDim[0]
N_A = env.nbActions

policyNet = MulChanConvNet(N_S, 128, N_A)
targetNet = deepcopy(policyNet)
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(config,
                 policyNet,
                 targetNet,
                 env,
                 optimizer,
                 torch.nn.MSELoss(reduction='none'),
                 N_A,
                 stateProcessor=stateProcessor,
                 experienceProcessor=experienceProcessor)

trainFlag = True
testFlag = True

if trainFlag:

    if config['loadExistingModel']:
        checkpoint = torch.load(config['saveModelFile'])
        agent.policyNet.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if config['loadCheckpointFlag']:
Example #3
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

print(policyNet.state_dict())

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet,
                 targetNet,
                 env,
                 optimizer,
                 torch.nn.MSELoss(),
                 N_S,
                 N_A,
                 config=config)

policy = deepcopy(env.map)
for i in range(policy.shape[0]):
    for j in range(policy.shape[1]):
        if env.map[i, j] == 0:
            policy[i, j] = -1
        else:
            policy[i, j] = agent.getPolicy(np.array([i, j]))

np.savetxt('DoubleQSimpleMazePolicyBeforeTrain' + mapName + '.txt',
           policy,
           fmt='%d',
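
agent.getPolicy(state) is used here only to dump the greedy action at each grid cell; its implementation is not part of this snippet. A hedged sketch of what such a greedy lookup usually amounts to (an assumption, not the DQNAgent code):

import numpy as np
import torch

def greedy_action(policyNet, state, device='cpu'):
    # Batch the single state, evaluate Q-values, and return the argmax action.
    stateTensor = torch.tensor(np.asarray(state, dtype=np.float32),
                               device=device).unsqueeze(0)
    with torch.no_grad():
        qValues = policyNet(stateTensor)
    return int(qValues.argmax(dim=1).item())
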
Example #4
config['logFrequency'] = 100
config['priorityMemoryOption'] = False
config['netUpdateOption'] = 'doubleQ'
config['netUpdateFrequency'] = 1
config['priorityMemory_absErrUpper'] = 5

import gym
from pybullet_envs.bullet.racecarGymEnv import RacecarGymEnv

env = RacecarGymEnv(renders=True, isDiscrete=True)
N_S = env.observation_space.shape[0]
N_A = env.action_space.n

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(config, policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A)

agent.train()
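
With config['netUpdateOption'] = 'doubleQ' above, agent.train() presumably computes double-Q targets: actions are chosen with the policy net and evaluated with the target net. A minimal illustrative sketch of that target computation (not the actual DQNAgent implementation):

import torch

def double_q_targets(policyNet, targetNet, nextStates, rewards, dones, gamma=0.99):
    # Select the next action with the policy net, evaluate it with the target net.
    with torch.no_grad():
        nextActions = policyNet(nextStates).argmax(dim=1, keepdim=True)
        nextQ = targetNet(nextStates).gather(1, nextActions).squeeze(1)
    # Terminal transitions (dones == 1) get no bootstrapped value.
    return rewards + gamma * nextQ * (1.0 - dones)
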
Example #5
netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A


policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
targetNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
optimizers = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(config, policyNet, targetNet, env, optimizers, torch.nn.MSELoss(reduction='none'), N_A, stateProcessor=stateProcessor)

agent.train()





nTraj = 100
nSteps  = 80

# test for starting from second stage
for i in range(nTraj):
    state = agent.env.reset()
    agent.env.stageID = 1
    state['stageID'] = agent.env.stageID
Example #6
N_A = env.nbActions

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(config, policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A)

xSet = np.linspace(-1, 1, 100)
policy = np.zeros_like(xSet)
for i, x in enumerate(xSet):
    policy[i] = agent.getPolicy(np.array([x]))

np.savetxt('StabilizerPolicyBeforeTrain.txt', policy, fmt='%d')

#agent.perform_random_exploration(10)
agent.train()
#storeMemory = ReplayMemory(100000)
agent.testPolicyNet(100)
#storeMemory.write_to_text('testPolicyMemory.txt')

Example #7
env.reset()
N_S = env.stateDim[0]
N_A = env.nbActions

policyNet = ConvNet(N_S, N_A)

#print(policyNet.state_dict())

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet,
                 targetNet,
                 env,
                 optimizer,
                 torch.nn.MSELoss(),
                 N_A,
                 config=config)

policy = deepcopy(env.mapMat)
for i in range(policy.shape[0]):
    for j in range(policy.shape[1]):
        if env.mapMat[i, j] == 1:
            policy[i, j] = -1
        else:
            sensorInfo = env.agent.getSensorInfoFromPos(np.array([i, j]))
            policy[i, j] = agent.getPolicy(sensorInfo)

np.savetxt('DynamicMazePolicyBeforeTrain' + mapName + '.txt',
           policy,
Example #8
env = DynamicMaze(config)
env.reset()
N_S = env.stateDim[0]
N_A = env.nbActions


policyNet = MulChanConvNet(N_S, 100, N_A)

#print(policyNet.state_dict())

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])


agent = DQNAgent(policyNet, targetNet, env, optimizer, torch.nn.MSELoss(), N_A,
                 stateProcessor=stateProcessor, config=config)

policy = deepcopy(env.mapMat)
for i in range(policy.shape[0]):
    for j in range(policy.shape[1]):
        if env.mapMat[i, j] == 1:
            policy[i, j] = -1
        else:
            sensorInfo = env.agent.getSensorInfoFromPos(np.array([i, j]))
            distance = np.array([1, 1]) - np.array([i, j])
            state = {'sensor': sensorInfo, 'target': distance}
            policy[i, j] = agent.getPolicy(state)


np.savetxt('DynamicMazePolicyBeforeTrain' + mapName + '.txt', policy, fmt='%d', delimiter='\t')
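
The dict states built here ({'sensor': ..., 'target': ...}) are what stateProcessor has to turn into network inputs for MulChanConvNet. The processor itself is not shown in these snippets; a purely illustrative sketch, with an assumed signature and tensor layout:

import numpy as np
import torch

def stateProcessor(states, device='cpu'):
    # Stack the sensor maps into an N x 1 x H x W tensor and the
    # target offsets into an N x 2 tensor (layout is an assumption).
    sensors = torch.tensor(np.stack([s['sensor'] for s in states]),
                           dtype=torch.float32, device=device).unsqueeze(1)
    targets = torch.tensor(np.stack([s['target'] for s in states]),
                           dtype=torch.float32, device=device)
    return sensors, targets
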
Example #9
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet,
                 targetNet,
                 env,
                 optimizer,
                 torch.nn.MSELoss(reduction='none'),
                 N_A,
                 config=config)

xSet = np.linspace(-1, 1, 100)
policy = np.zeros_like(xSet)
for i, x in enumerate(xSet):
    policy[i] = agent.getPolicy(np.array([x]))

np.savetxt('StabilizerPolicyBeforeTrain.txt', policy, fmt='%d')

#agent.perform_random_exploration(10)
agent.train()
storeMemory = ReplayMemory(100000)
agent.testPolicyNet(100, storeMemory)
Example #10
N_S = env.stateDim[0]
N_A = env.nbActions

policyNet = MulChanConvNet(N_S, 100, N_A)

#print(policyNet.state_dict())

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet,
                 targetNet,
                 env,
                 optimizer,
                 torch.nn.MSELoss(reduction='none'),
                 N_A,
                 stateProcessor=stateProcessor,
                 config=config)

trainFlag = True
testFlag = True

if trainFlag:

    if config['loadExistingModel']:
        checkpoint = torch.load(config['saveModelFile'])
        agent.policyNet.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
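    # (Illustrative, not from the source) the checkpoint loaded above would
    # typically be written after training with something like:
    # torch.save({'model_state_dict': agent.policyNet.state_dict(),
    #             'optimizer_state_dict': agent.optimizer.state_dict()},
    #            config['saveModelFile'])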

    plotPolicyFlag = True