phi) - distance[1] * math.cos(phi)

            # clip the relative target displacement so its norm does not
            # exceed the agent's maximum sensing range
            if math.sqrt(dx ** 2 + dy ** 2) > agent.env.agent.targetClipLength:
                angle = math.atan2(dy, dx)
                dx = agent.env.agent.targetClipLength * math.cos(angle)
                dy = agent.env.agent.targetClipLength * math.sin(angle)
            # dx = agent.env.agent.targetClipMap(dx) if dx > 0 else -agent.env.agent.targetClipMap(-dx)
            # dy = agent.env.agent.targetClipMap(dy) if dy > 0 else -agent.env.agent.targetClipMap(-dy)

            state = {'sensor': sensorInfo,
                     'target': np.array([dx, dy])}
            policy[i, j] = agent.getPolicy(state)
            Qvalue = agent.policyNet(stateProcessor([state], config['device'])[0])
            value[i, j] = Qvalue[0, policy[i, j]].cpu().item()

    # dump the untrained policy and value tables, one file per heading angle
    np.savetxt('DynamicMazePolicyBeforeTrain' + config['mapName']
               + 'phiIdx' + str(phiIdx) + '.txt',
               policy, fmt='%d', delimiter='\t')
    np.savetxt('DynamicMazeValueBeforeTrain' + config['mapName']
               + 'phiIdx' + str(phiIdx) + '.txt',
               value, fmt='%.3f', delimiter='\t')
    # plotPolicy(policy, N_A)

agent.train()
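# Illustration only (not part of the original script): the tables written above
# can be reloaded and rendered as heat maps for a quick visual check. Assumes
# matplotlib is installed; the phiIdx-0 file name mirrors the savetxt calls.
# import matplotlib.pyplot as plt
# v = np.loadtxt('DynamicMazeValueBeforeTrain' + config['mapName'] + 'phiIdx0.txt',
#                delimiter='\t')
# plt.imshow(v, origin='lower')
# plt.colorbar()
# plt.title('value before training, phiIdx 0')
# plt.savefig('DynamicMazeValueBeforeTrain.png')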
# roll out the trained greedy policy from a fixed start position
agent.env.currentState = np.array([0.0, agent.env.mapWidth / 2 + 1])
state['state'] = np.array([0.0, agent.env.mapWidth / 2 + 1])
for step in range(nSteps):
    # a negative epsilon disables exploration, so select_action is greedy
    action = agent.select_action(agent.policyNet, state, -0.1)
    nextState, reward, done, info = agent.env.step(action)
    state = nextState
    if done:
        print('finish step:', agent.env.stepCount)
        print(agent.env.currentState)
        break

policyFlag = True
if policyFlag:
    # evaluate the trained greedy policy and its Q-value on every grid cell
    policy = np.zeros((env.mapHeight, env.mapWidth), dtype=np.int32)
    value = np.zeros((env.mapHeight, env.mapWidth), dtype=np.float64)  # np.float was removed in NumPy >= 1.24
    for i in range(policy.shape[0]):
        for j in range(policy.shape[1]):
            state = {'state': np.array([i / agent.env.lengthScale,
                                        j / agent.env.lengthScale])}
            policy[i, j] = agent.select_action(agent.policyNet, state, -0.1)
            stateTorch = stateProcessor([state], config['device'])[0]
            Qvalue = agent.policyNet(stateTorch)
            value[i, j] = Qvalue[0, policy[i, j]].cpu().item()

    np.savetxt('SimpleMazePolicyAfterTrain_stage.txt', policy, fmt='%d', delimiter='\t')
    np.savetxt('SimpleMazeValueAfterTrain_stage.txt', value, fmt='%f', delimiter='\t')
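# For reference (a hypothetical sketch, not the repo's actual helper):
# stateProcessor is defined elsewhere in this project. For the plain 'state'
# dictionaries used above, a minimal version consistent with the call sites
# would stack the position vectors into a float tensor on the configured
# device and return it as the first element of a tuple:
# import torch
# def stateProcessorSketch(states, device):
#     batch = torch.tensor(np.stack([s['state'] for s in states]),
#                          dtype=torch.float32, device=device)
#     return batch, None  # call sites index [0] to get the network input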