Example #1
	def __init__(self):

		self.BATCH_SIZE = 128
		self.GAMMA = 0.99
		self.EPS_START = 1.0
		self.EPS_END = 0.05
		self.EPS_DECAY = 0.000005
		self.TARGET_UPDATE = 5

		self.pretrain_length = self.BATCH_SIZE
		#self.state_size = [55,3]
		
		self.action_size = 3
		self.hot_actions = np.array(np.identity(self.action_size).tolist())
		#self.action_size = len(self.hot_actions)
		self.learning_rate = 0.0005
		#self.total_episodes = 12
		self.max_steps = 1000

		self.env = Environment()

		self.memory_maxsize = 10000

		self.DQNetwork = DQNetwork(learning_rate=self.learning_rate, name='DQNetwork')

		self.TargetNetwork = DQNetwork(learning_rate=self.learning_rate, name='TargetNetwork')

		self.memory = ReplayMemory(max_size=self.memory_maxsize)

		self.saver = tf.train.Saver()
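
The TARGET_UPDATE setting above is usually paired with ops that copy the online network's weights into the target network every few episodes. A minimal TF 1.x-style sketch, assuming each DQNetwork builds its variables under a variable scope equal to its name argument (this helper is not part of the original project):

import tensorflow as tf  # TF 1.x API

def update_target_graph():
    # Collect both networks' trainable variables by scope name and build assign
    # ops that overwrite the target weights with the online weights.
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]

Running the returned ops in the session, e.g. sess.run(update_target_graph()), every self.TARGET_UPDATE episodes keeps TargetNetwork in step with DQNetwork.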
Example #2
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 buffer_size=100000,
                 batch_size=64,
                 update_frequency=2,
                 gamma=.99,
                 learning_rate=5e-4,
                 tau=1e-3):
        self.state_size = state_size
        self.action_size = action_size
        self.random = random.seed(seed)
        self.batch_size = batch_size

        self.memory = ReplayBuffer(self.action_size, buffer_size, batch_size,
                                   seed)
        self.time_step = 0
        self.update_frequency = update_frequency

        self.qnetwork_local = DQNetwork(state_size, action_size,
                                        seed).to(device)
        self.qnetwork_target = DQNetwork(state_size, action_size,
                                         seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=learning_rate)

        # hyper-parameters
        self.gamma = gamma
        self.tau = tau
Example #3
    def __init__(self,
                 lr,
                 inputChannels,
                 stateShape,
                 numActions,
                 batchSize,
                 epsilon=1.0,
                 gamma=0.99,
                 layer1Size=1024,
                 layer2Size=512,
                 maxMemSize=100000,
                 epsMin=0.01,
                 epsDecay=5e-4):
        self.lr = lr
        self.epsilon = epsilon
        self.epsMin = epsMin
        self.epsDecay = epsDecay
        self.gamma = gamma
        self.batchSize = batchSize
        self.actionSpace = list(range(numActions))
        self.maxMemSize = maxMemSize

        self.memory = ReplayBuffer(maxMemSize, stateShape)
        self.deepQNetwork = DQNetwork(lr, inputChannels, numActions)
Example #4
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DQNetwork(state_size, action_size,
                                        seed).to(device)
        self.qnetwork_target = DQNetwork(state_size, action_size,
                                         seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #5
class DQAgent():
    def __init__(self,
                 lr,
                 inputChannels,
                 stateShape,
                 numActions,
                 batchSize,
                 epsilon=1.0,
                 gamma=0.99,
                 layer1Size=1024,
                 layer2Size=512,
                 maxMemSize=100000,
                 epsMin=0.01,
                 epsDecay=5e-4):
        self.lr = lr
        self.epsilon = epsilon
        self.epsMin = epsMin
        self.epsDecay = epsDecay
        self.gamma = gamma
        self.batchSize = batchSize
        self.actionSpace = list(range(numActions))
        self.maxMemSize = maxMemSize

        self.memory = ReplayBuffer(maxMemSize, stateShape)
        self.deepQNetwork = DQNetwork(lr, inputChannels, numActions)

    '''
    REENABLE EPSILON GREEDY
    '''

    def chooseAction(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor(observation).float().clone().detach()
            state = state.to(self.deepQNetwork.device)
            state = state.unsqueeze(0)
            policy = self.deepQNetwork(state)
            action = torch.argmax(policy).item()
            return action
        else:
            return np.random.choice(self.actionSpace)

    def storeMemory(self, state, action, reward, nextState, done):
        self.memory.storeMemory(state, action, reward, nextState, done)

    def learn(self):
        if self.memory.memCount < self.batchSize:
            return

        self.deepQNetwork.optimizer.zero_grad()

        stateBatch, actionBatch, rewardBatch, nextStateBatch, doneBatch = \
            self.memory.sample(self.batchSize)
        stateBatch = torch.tensor(stateBatch).to(self.deepQNetwork.device)
        actionBatch = torch.tensor(actionBatch).to(self.deepQNetwork.device)
        rewardBatch = torch.tensor(rewardBatch).to(self.deepQNetwork.device)
        nextStateBatch = torch.tensor(nextStateBatch).to(
            self.deepQNetwork.device)
        doneBatch = torch.tensor(doneBatch).to(self.deepQNetwork.device)

        batchIndex = np.arange(self.batchSize, dtype=np.int64)

        actionQs = self.deepQNetwork(stateBatch)[batchIndex, actionBatch]
        allNextActionQs = self.deepQNetwork(nextStateBatch)
        nextActionQs = torch.max(allNextActionQs, dim=1)[0]
        nextActionQs[doneBatch] = 0.0
        qTarget = rewardBatch + self.gamma * nextActionQs

        loss = self.deepQNetwork.loss(qTarget,
                                      actionQs).to(self.deepQNetwork.device)
        loss.backward()
        self.deepQNetwork.optimizer.step()

        if self.epsilon > self.epsMin:
            self.epsilon -= self.epsDecay
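
A hedged usage sketch for the DQAgent class above; the environment, inputChannels, learning rate, and batch size are illustrative assumptions rather than values from the original project, and the classic 4-tuple gym API is assumed:

import gym

env = gym.make("CartPole-v1")
agent = DQAgent(lr=1e-3,
                inputChannels=1,
                stateShape=env.observation_space.shape,
                numActions=env.action_space.n,
                batchSize=64)

state = env.reset()          # classic gym API: reset() returns only the observation
for step in range(10_000):
    action = agent.chooseAction(state)
    nextState, reward, done, info = env.step(action)
    agent.storeMemory(state, action, reward, nextState, done)
    agent.learn()            # no-op until batchSize transitions have been stored
    state = env.reset() if done else nextState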
Example #6
    def __init__(self, level_name):
        self.level_name = level_name  
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset the graph
        tf.reset_default_graph()
        
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        
        # initialize deque with zero images
        self.stacked_frames = deque([np.zeros((100, 128), dtype=int) for i in range(stack_size)], maxlen=4)

        for i in range(pretrain_length):    
            # If it's the first step
            if i == 0:
                state = self.env.reset()        
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

            # Get next state, the rewards, done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)

                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # our new state is now the next_state
                state = next_state
       
        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        
        self.write_op = tf.summary.merge_all()
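
stack_frames is called above but not shown; a minimal sketch of the usual frame-stacking helper, assuming a hypothetical preprocess_frame that grayscales and resizes a raw frame to the (100, 128) shape used in the deque above:

from collections import deque
import numpy as np

def stack_frames(stacked_frames, state, is_new_episode, stack_size=4):
    frame = preprocess_frame(state)   # hypothetical helper: grayscale + resize
    if is_new_episode:
        # start a fresh stack by repeating the first frame of the episode
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    # stack along the channel axis so the network sees the last stack_size frames at once
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames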
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DQNetwork(state_size, action_size,
                                        seed).to(device)
        self.qnetwork_target = DQNetwork(state_size, action_size,
                                         seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model

        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #8
File: core.py  Project: ducanhlhp9/rl
# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Q learning hyperparameters
discount_rate = 0.95               # Discounting rate

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 100          # Number of experiences the Memory can keep

tf.reset_default_graph()

DQNetwork = DQNetwork(state_size, action_size, learning_rate)

# PART II: GEN MEMORY
print("gen memory")

# class Memory():
#     def __init__(self, max_size):
#         self.buffer = deque(maxlen = max_size)
    
#     def add(self, experience):
#         self.buffer.append(experience)
    
#     def sample(self, batch_size):
#         buffer_size = len(self.buffer)
#         index = np.random.choice(np.arange(buffer_size),
#                                 size = batch_size,
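
The exploration parameters above (explore_start, explore_stop, decay_rate) are typically combined into an exponentially decaying epsilon. A minimal sketch, assuming decay_step is a running count of environment steps (this helper does not appear in core.py):

import numpy as np

def epsilon_by_step(decay_step,
                    explore_start=1.0, explore_stop=0.01, decay_rate=0.0001):
    # Exponential decay from explore_start toward explore_stop as decay_step grows.
    return explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)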
Example #9
class YellowBananaThief:
    """ A smart agent that interacts with the environment to pick up yellow bananas"""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 buffer_size=100000,
                 batch_size=64,
                 update_frequency=2,
                 gamma=.99,
                 learning_rate=5e-4,
                 tau=1e-3):
        self.state_size = state_size
        self.action_size = action_size
        self.random = random.seed(seed)
        self.batch_size = batch_size

        self.memory = ReplayBuffer(self.action_size, buffer_size, batch_size,
                                   seed)
        self.time_step = 0
        self.update_frequency = update_frequency

        self.qnetwork_local = DQNetwork(state_size, action_size,
                                        seed).to(device)
        self.qnetwork_target = DQNetwork(state_size, action_size,
                                         seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=learning_rate)

        # hyper-parameters
        self.gamma = gamma
        self.tau = tau

    def act(self, state, epsilon):
        """ Returns an epsilon greedy action to take in the current state
            :param state: The current state in the environment
            :param epsilon: Epsilon value to apply epsilon-greedy action selection
        """
        def action_probabilities(action_vals, eps, num_actions):
            """ Determine the epsilon probabilities of choosing actions """
            probs = np.ones(num_actions, dtype=float) * (eps / num_actions)
            best_action = np.argmax(action_vals)
            probs[best_action] += (1. - eps)
            return probs

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()  # get the network in evaluation mode and pull values from it
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # get the network back into train mode

        action_probs = action_probabilities(action_values.cpu().data.numpy(),
                                            epsilon, self.action_size)
        return np.random.choice(np.arange(self.action_size), p=action_probs)

    def step(self, state, action, reward, next_state, done):
        """ Step forward to train the model """
        self.memory.add(state, action, reward, next_state, done)
        self.time_step = (self.time_step + 1) % self.update_frequency
        if self.time_step == 0:
            if len(self.memory) > self.batch_size:
                # enough samples have been collected for learning from experience
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """ Train the agent from a sample of experiences """
        def soft_update(local_model, target_model, tau):
            """Soft update model parameters.
                   θ_target = τ*θ_local + (1 - τ)*θ_target

                   Params
                   ======
                       local_model (PyTorch model): weights will be copied from
                       target_model (PyTorch model): weights will be copied to
                       tau (float): interpolation parameter
                   """
            for target_param, local_param in zip(target_model.parameters(),
                                                 local_model.parameters()):
                target_param.data.copy_(tau * local_param.data +
                                        (1.0 - tau) * target_param.data)

        states, actions, rewards, next_states, dones = experiences

        # max predicted Q values for the next state
        q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Q targets for current state
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))

        # get expected q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        # compute model loss
        loss = F.mse_loss(q_expected, q_targets)

        # minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def local_qnet(self):
        """ Returns the trained model """
        return self.qnetwork_local
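
Unlike DQAgent above, which decays epsilon inside learn(), YellowBananaThief leaves the exploration schedule to the caller. A hedged training-loop sketch, with an illustrative gym environment standing in for the banana-collecting environment referenced in the docstring:

import gym

env = gym.make("LunarLander-v2")      # illustrative env (needs the gym box2d extra)
agent = YellowBananaThief(state_size=env.observation_space.shape[0],
                          action_size=env.action_space.n)

epsilon = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    epsilon = max(0.01, epsilon * 0.995)   # simple multiplicative decay handled by the caller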
Example #10
import tensorflow as tf

import numpy as np

from environment import Environment

from model import DQNetwork

env = Environment()

DQNetwork = DQNetwork(learning_rate=0)

with tf.Session() as sess:

    total_test_rewards = []

    saver = tf.train.Saver()

    saver.restore(sess, "./models/model.ckpt")

    for episode in range(1):

        total_rewards = 0

        state = env.reset()

        done = False

        while not done:

            state = state.reshape(