Example #1
    def __init__(self, state_size, action_size, seed, device, params):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            device (torch.device): device on which tensors are allocated
            params (dict): hyperparameters (e.g. LR, BUFFER_SIZE, BATCH_SIZE)
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)  # random.seed() returns None, so store the seed itself
        self.seed = seed
        self.device = device
        self.params = params

        # Q-Network
        self.qnetwork_local = qn.QNetwork(state_size, action_size,
                                          seed).to(device)
        self.qnetwork_target = qn.QNetwork(state_size, action_size,
                                           seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=params['LR'])

        # Replay memory
        self.memory = rp.ReplayBuffer(action_size, params['BUFFER_SIZE'],
                                      params['BATCH_SIZE'], seed, device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
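
    # Sketch of the step() method this constructor sets up for. The real method
    # is not shown above, so memory.add/sample, learn(), and the UPDATE_EVERY /
    # GAMMA keys are assumptions based on the comments and the params usage.
    def step(self, state, action, reward, next_state, done):
        # save the transition, then learn every UPDATE_EVERY time steps
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.params['UPDATE_EVERY']
        if self.t_step == 0 and len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])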
Example #2
    def __init__(self):
        self.replay_buffer = replaybuffer.ReplayBuffer(5000)

        self.env = PendulumEnv()

        observation = self.env.reset()

        # use the GPU when available, otherwise fall back to the CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # instantiate models (Pendulum: 3-dimensional state, 1-dimensional action)
        state_size = 3
        action_size = 1
        self.state_dreamer = models.StateDreamer(state_size, action_size)
        self.reward_dreamer = models.RewardDreamer(state_size)
        self.actor = models.Actor(state_size, action_size)
        self.critic = models.Critic(state_size, action_size)

        # move the models to the device
        self.state_dreamer.to(self.device)
        self.reward_dreamer.to(self.device)
        self.actor.to(self.device)
        self.critic.to(self.device)

        # create an optimizer for each model
        self.state_dreamer_optimizer = optim.SGD(
            self.state_dreamer.parameters(), lr=0.01, momentum=0.9)
        self.reward_dreamer_optimizer = optim.SGD(
            self.reward_dreamer.parameters(), lr=0.01, momentum=0.9)
        self.actor_optimizer = optim.SGD(self.actor.parameters(),
                                         lr=0.0001,
                                         momentum=0.9)
        self.critic_optimizer = optim.SGD(self.critic.parameters(),
                                          lr=0.001,
                                          momentum=0.9)
Example #3
 def __init__(self, agent, environment, max_size, episodic=True):
     self.agent = agent
     self.environment = environment
     self.max_size = max_size
     self.replay_buffer = rb.ReplayBuffer(max_size, agent.state_dim,
                                          agent.action_dim)
     self.episodic = episodic
Example #4
 def __init__(self, state_dim, action_dim):
     self.replay_buffer = replaybuffer.ReplayBuffer()
     self.q_network = Q_Network(state_dim, action_dim, 16)
     self.q_network_target = copy.deepcopy(self.q_network)
     self.q_network_optim = torch.optim.Adam(
         self.q_network.parameters(), lr=0.001)
     self.criterion = nn.MSELoss()
     self.state_dim = state_dim
     self.action_dim = action_dim
     self.discount = 0.995
     self.tau = 0.05
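
 # Sketch of the soft target-network update that tau = 0.05 typically drives:
 # theta_target <- tau * theta_local + (1 - tau) * theta_target. The actual
 # update method is not shown above, so this body is an assumption.
 def soft_update_target(self):
     for target_param, local_param in zip(
             self.q_network_target.parameters(),
             self.q_network.parameters()):
         target_param.data.copy_(
             self.tau * local_param.data
             + (1.0 - self.tau) * target_param.data)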
Example #5
 def __init__(self, state_dim, goal_dim, action_dim):
     self.replay_buffer = replaybuffer.ReplayBuffer()
     self.q_network = Goal_Based_Q_Network(state_dim, goal_dim, action_dim, 256)
     self.q_network_target = copy.deepcopy(self.q_network)
     self.q_network_optimizer = torch.optim.Adam(
         self.q_network.parameters(), lr=0.001, weight_decay=1e-3)
     self.criterion = nn.MSELoss()
     self.state_dim = state_dim
     self.goal_dim = goal_dim
     self.action_dim = action_dim
     self.discount = 0.99
     self.tau = 0.05
Example #6
 def test_append(self):
     max_buffer_size = 5
     buffer = replaybuffer.ReplayBuffer((2, 2), max_buffer_size)
     # inputs
     old_state = np.arange(4).reshape((2, 2))
     new_state = old_state * 10
     action = 0
     reward = 1
     buffer.append(old_state, new_state, action, reward)
     self.assertEqual(np.all(buffer.old_state[0] == old_state), True)
     self.assertEqual(np.all(buffer.old_state[1] == old_state), False)
     self.assertEqual(np.all(buffer.new_state[0] == old_state), False)
     self.assertEqual(buffer.old_state.shape, (max_buffer_size, 2, 2))
Example #7
 def test_shuffle(self):
     # NOTE: this test is non-deterministic; the assertion only holds when both
     # shuffles happen to land in the same order (roughly half the time)
     max_buffer_size = 3
     buffer = replaybuffer.ReplayBuffer((2, 2),
                                        max_buffer_size=max_buffer_size)
     old_state = np.arange(4).reshape((2, 2))
     new_state = old_state * 10
     action = 2
     reward = 1
     buffer.append(old_state, new_state, action, reward)
     buffer.append(old_state * 2, new_state * 2, action * 2, reward * 2)
     states_shuffled = np.array([old_state, old_state * 2])
     np.random.shuffle(states_shuffled)
     buffer.shuffle()
     old_state, _, _, _ = buffer.next_batch(1)
     self.assertEqual(np.all(states_shuffled[0] == old_state), True)
Example #8
    def test_next_batch(self):
        max_buffer_size = 3
        buffer = replaybuffer.ReplayBuffer((2, 2),
                                           max_buffer_size=max_buffer_size)
        # inputs
        old_state = np.arange(4).reshape((2, 2))
        new_state = old_state * 10
        action = 2
        reward = 1
        self.assertEqual(buffer.empty(), True)
        self.assertEqual(buffer.full(), False)
        # test indices
        self.assertEqual(buffer.write_idx, 0)
        buffer.append(old_state, new_state, action, reward)
        self.assertEqual(buffer.write_idx, 1)
        buffer.append(old_state * 2, new_state * 2, action * 2, reward * 2)
        # test empty flag
        self.assertEqual(buffer.empty(), False)
        self.assertEqual(buffer.full(), False)
        buffer.append(old_state * 3, new_state * 3, action * 3, reward * 3)
        self.assertEqual(buffer.empty(), False)
        self.assertEqual(buffer.full(), True)

        self.assertEqual(buffer.write_idx, 3)
        self.assertEqual(buffer.read_idx, 0)
        # test next_batch results (remainder size)
        old_state, new_state, action, reward = buffer.next_batch(2)
        self.assertEqual(buffer.read_idx, 2)
        self.assertEqual(old_state.shape, (2, 2, 2))
        self.assertEqual(new_state.shape, (2, 2, 2))
        self.assertEqual(np.all(action == [2, 4]), True)
        self.assertEqual(np.all(reward == [1, 2]), True)
        self.assertEqual(buffer.empty(), False)
        old_state, new_state, action, reward = buffer.next_batch(2)
        self.assertEqual(buffer.empty(), True)
        # buffer is now drained; return it so callers can test next_batch on empty
        return buffer
Example #9
 def test_init(self):
     # don't use the prepare-buffer helper here
     buffer = replaybuffer.ReplayBuffer((2, 2), 2)
     self.assertEqual(buffer.old_state.shape, (2, 2, 2))
     self.assertEqual(buffer.new_state.shape, (2, 2, 2))
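
# A minimal ReplayBuffer sketch consistent with the behaviour exercised by the
# tests above (append / shuffle / next_batch / empty / full plus write_idx and
# read_idx). It is reconstructed from the assertions as an illustration, not
# taken from the project's replaybuffer module.
import numpy as np


class ReplayBuffer:
    def __init__(self, state_shape, max_buffer_size):
        self.max_buffer_size = max_buffer_size
        self.old_state = np.zeros((max_buffer_size, *state_shape))
        self.new_state = np.zeros((max_buffer_size, *state_shape))
        self.action = np.zeros(max_buffer_size)
        self.reward = np.zeros(max_buffer_size)
        self.write_idx = 0  # next slot to write
        self.read_idx = 0   # next slot to read

    def empty(self):
        return self.read_idx >= self.write_idx

    def full(self):
        return self.write_idx >= self.max_buffer_size

    def append(self, old_state, new_state, action, reward):
        self.old_state[self.write_idx] = old_state
        self.new_state[self.write_idx] = new_state
        self.action[self.write_idx] = action
        self.reward[self.write_idx] = reward
        self.write_idx += 1

    def shuffle(self):
        # permute the filled portion of the buffer in unison and rewind reads
        perm = np.random.permutation(self.write_idx)
        self.old_state[:self.write_idx] = self.old_state[perm]
        self.new_state[:self.write_idx] = self.new_state[perm]
        self.action[:self.write_idx] = self.action[perm]
        self.reward[:self.write_idx] = self.reward[perm]
        self.read_idx = 0

    def next_batch(self, batch_size):
        # return up to batch_size transitions, advancing the read pointer
        end = min(self.read_idx + batch_size, self.write_idx)
        batch = slice(self.read_idx, end)
        self.read_idx = end
        return (self.old_state[batch], self.new_state[batch],
                self.action[batch], self.reward[batch])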
Example #10
import math

import torch
import torch.autograd as autograd
import torch.optim as optim

from IPython.display import clear_output
import matplotlib.pyplot as plt

import cnndqn
import replaybuffer

USE_CUDA = torch.cuda.is_available()


def Variable(*args, **kwargs):
    # wrap a tensor in autograd.Variable, moving it to the GPU when available
    var = autograd.Variable(*args, **kwargs)
    return var.cuda() if USE_CUDA else var

model = cnndqn.CnnDQN((8, 8), 8 * 8)  # 8x8 board input, one action per square
if USE_CUDA:
    model = model.cuda()  # keep the model on the same device as its inputs


optimizer = optim.Adam(model.parameters(), lr=0.00001)

replay_initial = 300  # transitions to collect before learning starts
replay_buffer = replaybuffer.ReplayBuffer(10000)

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000

def epsilon_by_frame(frame_idx):
    # exponential decay from epsilon_start toward epsilon_final
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)

n = 8  # board size (even)

board = [['0' for x in range(n)] for y in range(n)]
# 8 directions
dirx = [-1, 0, 1, -1, 1, -1, 0, 1]
diry = [-1, -1, -1, 0, 0, 1, 1, 1]

opt = 2
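
# Quick sanity check of the epsilon schedule defined above: it decays
# exponentially from epsilon_start toward epsilon_final with a time
# constant of epsilon_decay frames.
for frame_idx in (0, 10000, 30000, 100000):
    print(frame_idx, round(epsilon_by_frame(frame_idx), 3))
# roughly: 0 -> 1.0, 10000 -> 0.719, 30000 -> 0.374, 100000 -> 0.045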