Example #1
def init_dqn(args):
    """Intitialises and returns the necessary objects for
       Deep Q-learning:
       Q-network, target network, replay buffer and optimizer.
    """
    logging.info(
        "Initialisaling DQN with architecture {} and optimizer {}".format(
            args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    elif args.dqn_archi == 'cnn':
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)
    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(),
                                        lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(),
                                     lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)
    q_target.load_state_dict(
        q_net.state_dict())  # set params of q_target to be the same
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(schedule_timesteps=int(
            args.exploration_fraction * args.n_agent_steps),
                                          initial_p=args.epsilon_start,
                                          final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)

    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule
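
A minimal usage sketch for the function above, assuming an argparse-style `args` namespace that carries the fields referenced in the body (the concrete values are placeholders, and the DQN/CnnDQN constructors may read further architecture fields from `args`):

import argparse

args = argparse.Namespace(
    dqn_archi='mlp', optimizer_agent='Adam',
    obs_shape=(4,), n_actions=2,
    lr_agent=1e-3, lambda_agent=0.0,
    replay_buffer_size=100_000,
    epsilon_annealing_scheme='linear',
    exploration_fraction=0.1, n_agent_steps=100_000,
    epsilon_start=1.0, epsilon_stop=0.05)

q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule = init_dqn(args)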
Example #2
class Agent():
    def __init__(self, learn_rate, 
            state_shape, num_actions, action_shape, 
            batch_size, slice_size):
        self.gamma = 0.999
        self.tau = 0.01
        self.clip_grad_norm = 0.1
        self.has_target_net = True

        self.state_shape = state_shape
        self.num_actions = num_actions      #   this is how many actions there are to choose from
        self.action_shape = action_shape    #   this is how many actions the env accepts at each step

        self.buffer_size = 1_000_000
        self.batch_size = batch_size    # *times slice_size, because recurrency/rollouts
        self.slice_size = slice_size

        self.slice_replay_buffer = MemorySliceReplayBuffer(
            size=self.buffer_size, slice_size=self.slice_size, 
            state_shape=self.state_shape, action_shape=self.action_shape)
        self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=300)
        # self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)


        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.net = DQN(state_shape, num_actions).to(self.device)
        if self.has_target_net:
            self.target_net  = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        for param, target_param in zip(self.net.parameters(), self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
        state = torch.tensor(observation).float().unsqueeze(0)
        state = state.detach().to(self.device)

        q_values, hidden_state_ = self.net(state, hidden_state)
        action = torch.argmax(q_values[0]).item()

        if random.random() <= self.epsilon.value():
            # random.randint is inclusive, so sample in [0, num_actions - 1]
            action = random.randint(0, self.num_actions - 1)

        return action, hidden_state_

    def learn(self, stats):
        if self.slice_replay_buffer.count < self.batch_size:
            return 

        self.net.train()

        states_slices, actions_slices, rewards_slices, next_states_slices, dones_slices = self.slice_replay_buffer.sample(self.batch_size, self.device)

        batch_losses = []
        hidden_states = self.net.get_batch_hidden_state(self.batch_size).to(self.device)

        for slice_index in range(self.slice_size):
            states = states_slices[:, slice_index]
            actions = actions_slices[:, slice_index]
            rewards = rewards_slices[:, slice_index]
            states_ = next_states_slices[:, slice_index]
            dones = dones_slices[:, slice_index]

            batch_indices = np.arange(self.batch_size, dtype=np.int64)
            qs, hidden_states_ = self.net(states, hidden_states)
            chosen_q = qs[batch_indices, actions.T[0]]

            if self.has_target_net:
                qs_, hidden_state_3 = self.target_net(states_, hidden_states_)
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                actions_ = torch.argmax(action_qs_, dim=1)
                chosen_q_ = qs_[batch_indices, actions_]
            else:
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                chosen_q_ = torch.max(action_qs_, dim=1)[0]

            rewards = rewards.T[0]
            # bootstrap only for non-terminal steps; detach so no gradient flows through the target
            q_target = rewards + self.gamma * chosen_q_.detach() * (1.0 - dones.T[0].float())

            loss = torch.mean((q_target - chosen_q) ** 2)
            batch_losses.append(loss)

            hidden_states = hidden_states_
            hidden_states[dones.T[0]] = 0.0 #   if an episode ends mid slice then zero the hidden_states
                                            #   this could be a problem if backprop stops here

        batch_losses = torch.stack(batch_losses)
        batch_loss = torch.mean(batch_losses)
        stats.last_loss = batch_loss.item()
        self.optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.clip_grad_norm)
        self.optimizer.step()

        self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()
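
A rough interaction-loop sketch for the recurrent agent above. The environment, the `Stats` holder, and the buffer's `store(...)` call are assumptions (the real `MemorySliceReplayBuffer` interface is not shown here); `choose_action` and `learn` come from the class itself:

agent = Agent(learn_rate=1e-3, state_shape=(8,), num_actions=4,
              action_shape=(1,), batch_size=32, slice_size=8)
stats = Stats()  # hypothetical object exposing a `last_loss` attribute

for episode in range(500):
    obs, done = env.reset(), False
    hidden_state = agent.net.get_batch_hidden_state(1).to(agent.device)
    while not done:
        action, hidden_state = agent.choose_action(obs, hidden_state)
        obs_, reward, done, _ = env.step(action)
        agent.slice_replay_buffer.store(obs, action, reward, obs_, done)  # assumed API
        obs = obs_
        agent.learn(stats)  # only trains once enough slices are buffered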
Example #3
from collections import deque
import random
import torch
from torch import optim
from tqdm import tqdm
from env import Env
from hyperparams import ACTION_DISCRETISATION, OFF_POLICY_BATCH_SIZE as BATCH_SIZE, DISCOUNT, EPSILON, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS, REPLAY_SIZE, TARGET_UPDATE_INTERVAL, TEST_INTERVAL, UPDATE_INTERVAL, UPDATE_START
from models import DQN, create_target_network
from utils import plot

env = Env()
agent = DQN(HIDDEN_SIZE, ACTION_DISCRETISATION)
target_agent = create_target_network(agent)
optimiser = optim.Adam(agent.parameters(), lr=LEARNING_RATE)
D = deque(maxlen=REPLAY_SIZE)


def convert_discrete_to_continuous_action(action):
    return action.to(dtype=torch.float32) - ACTION_DISCRETISATION // 2


def test(agent):
    with torch.no_grad():
        env = Env()
        state, done, total_reward = env.reset(), False, 0
        while not done:
            action = agent(state).argmax(
                dim=1,
                keepdim=True)  # Use purely exploitative policy at test time
            state, reward, done = env.step(
                convert_discrete_to_continuous_action(action))
            total_reward += reward
        return total_reward
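
The snippet stops before the training loop. Below is a hedged sketch of how the objects defined above (agent, target_agent, optimiser, the replay deque D and the imported hyperparameters) are typically wired together; the loop in the original repository may differ in detail.

state, done = env.reset(), False
for step in tqdm(range(1, MAX_STEPS + 1)):
    # Epsilon-greedy behaviour policy
    with torch.no_grad():
        if random.random() < EPSILON:
            action = torch.randint(ACTION_DISCRETISATION, (1, 1))
        else:
            action = agent(state).argmax(dim=1, keepdim=True)
    next_state, reward, done = env.step(convert_discrete_to_continuous_action(action))
    D.append({'state': state, 'action': action, 'reward': reward,
              'next_state': next_state, 'done': done})
    state = env.reset() if done else next_state

    if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
        batch = random.sample(D, BATCH_SIZE)
        states = torch.cat([d['state'] for d in batch])
        actions = torch.cat([d['action'] for d in batch])
        rewards = torch.tensor([[float(d['reward'])] for d in batch])
        next_states = torch.cat([d['next_state'] for d in batch])
        not_done = torch.tensor([[1.0 - float(d['done'])] for d in batch])

        with torch.no_grad():
            target = rewards + not_done * DISCOUNT * target_agent(next_states).max(dim=1, keepdim=True)[0]
        q_value = agent(states).gather(1, actions)
        loss = (q_value - target).pow(2).mean()
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    if step % TARGET_UPDATE_INTERVAL == 0:
        target_agent.load_state_dict(agent.state_dict())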
Example #4
def process_frame(screen):
    # crop, downsample, and convert the RGB frame to 8-bit grayscale
    return screen[32:-16:2, ::2].mean(axis=2).astype(np.uint8)

### SETUP ###
env = gym.make(game)
win_streak = []
frame_shape = process_frame(env.reset()).shape
state_shape = (frames_number, *frame_shape)

if not torch.cuda.is_available(): print('cuda not available')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

net = DQN(state_shape, env.action_space.n).to(device)
target_net = copy(net).to(device)

loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

training_queue = Queue()
memory = Memory(int(1e+4), training_queue, device)


### UTILITY FUNCTIONS ###
def step(action, reset=False):
    if reset:
        state = [process_frame(env.reset())]
        loops = frames_number - 1
    else:
        state = []
        loops = frames_number

    reward = 0.
Example #5
class QAgent:
    def __init__(self, epsilon_start, epsilon_end, epsilon_anneal, nb_actions,
                 learning_rate, gamma, batch_size, replay_memory_size,
                 hidden_size, model_input_size, use_PER, use_ICM):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_anneal_over_steps = epsilon_anneal

        self.num_actions = nb_actions

        self.gamma = gamma

        self.batch_size = batch_size

        self.learning_rate = learning_rate

        self.step_no = 0

        self.policy = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target.load_state_dict(self.policy.state_dict())
        self.target.eval()
        self.hidden_size = hidden_size
        self.optimizer = torch.optim.AdamW(self.policy.parameters(),
                                           lr=self.learning_rate)

        self.use_PER = use_PER
        if use_PER:
            self.replay = Prioritized_Replay_Memory(replay_memory_size)
        else:
            self.replay = Replay_Memory(replay_memory_size)

        self.loss_function = torch.nn.MSELoss()
        self.use_ICM = use_ICM
        if use_ICM:
            self.icm = ICM(model_input_size, nb_actions)

    # Get the current epsilon value according to the start/end and annealing values
    def get_epsilon(self):
        eps = self.epsilon_end
        if self.step_no < self.epsilon_anneal_over_steps:
            eps = self.epsilon_start - self.step_no * \
                ((self.epsilon_start - self.epsilon_end) /
                 self.epsilon_anneal_over_steps)
        return eps

    # select an action with epsilon greedy
    def select_action(self, state):
        self.step_no += 1
        if np.random.uniform() > self.get_epsilon():
            with torch.no_grad():
                return torch.argmax(self.policy(state)).view(1)
        else:
            return torch.tensor([random.randrange(self.num_actions)],
                                device=self.device,
                                dtype=torch.long)

    # update the model according to one step td targets
    def update_model(self):
        if self.use_PER:
            batch_index, batch, ImportanceSamplingWeights = self.replay.sample(
                self.batch_size)
        else:
            batch = self.replay.sample(self.batch_size)

        batch_tuple = Transition(*zip(*batch))

        state = torch.stack(batch_tuple.state)
        action = torch.stack(batch_tuple.action)
        reward = torch.stack(batch_tuple.reward)
        next_state = torch.stack(batch_tuple.next_state)
        done = torch.stack(batch_tuple.done)

        self.optimizer.zero_grad()
        if self.use_ICM:
            self.icm.optimizer.zero_grad()
            forward_loss = self.icm.get_forward_loss(state, action, next_state)
            inverse_loss = self.icm.get_inverse_loss(state, action, next_state)
            icm_loss = ((1 - self.icm.beta) * inverse_loss.mean()
                        + self.icm.beta * forward_loss.mean())

        td_estimates = self.policy(state).gather(1, action).squeeze()

        td_targets = reward + (1 - done.float()) * self.gamma * \
            self.target(next_state).max(1)[0].detach_()

        if self.use_PER:
            weights = torch.tensor(ImportanceSamplingWeights,
                                   device=self.device, dtype=torch.float32)
            # importance-sampling-weighted squared TD error
            loss = (weights * (td_estimates - td_targets) ** 2).mean()

            errors = td_estimates - td_targets
            self.replay.batch_update(batch_index,
                                     errors.detach().cpu().numpy())
        else:
            loss = self.loss_function(td_estimates, td_targets)

        if self.use_ICM:
            loss = self.icm.lambda_weight * loss + icm_loss

        loss.backward()

        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)

        if self.use_ICM:
            self.icm.optimizer.step()

        self.optimizer.step()

        return loss.item()

    # set target net parameters to policy net parameters
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())

    # save model
    def save(self, path, name):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, os.path.join(path, name + ".pt"))
        torch.save(self.policy.state_dict(), filename)

    # load a model
    def load(self, path):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, path)
        self.policy.load_state_dict(torch.load(filename))

    # store experience in replay memory
    def cache(self, state, action, reward, next_state, done):
        self.replay.push(state, action, reward, next_state, done)
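
A usage sketch for QAgent, assuming a discrete-action gym-style environment and that observations are converted to tensors on `agent.device` before being passed in (as `select_action` and `cache` expect); the hyperparameter values and the target-update period are placeholders:

import gym

env = gym.make("CartPole-v1")
agent = QAgent(epsilon_start=1.0, epsilon_end=0.05, epsilon_anneal=10_000,
               nb_actions=env.action_space.n, learning_rate=1e-3, gamma=0.99,
               batch_size=64, replay_memory_size=50_000, hidden_size=128,
               model_input_size=env.observation_space.shape[0],
               use_PER=False, use_ICM=False)

state = torch.tensor(env.reset(), dtype=torch.float32, device=agent.device)
for step in range(1, 100_001):
    action = agent.select_action(state)
    obs_, reward, done, _ = env.step(action.item())
    next_state = torch.tensor(obs_, dtype=torch.float32, device=agent.device)
    agent.cache(state, action,
                torch.tensor(reward, device=agent.device),
                next_state,
                torch.tensor(done, device=agent.device))
    state = next_state
    if done:
        state = torch.tensor(env.reset(), dtype=torch.float32, device=agent.device)
    if step > agent.batch_size:
        agent.update_model()
    if step % 500 == 0:
        agent.update_target()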
Example #6
File: trainer.py  Project: tzyrq/marl_dqn
class Trainer(object):
    def __init__(self, args, n_agents, n_cities, device, data_loader):
        self.n_agents = n_agents
        self.n_cities = n_cities

        self.device = device

        self.args = args
        self.Encoder = Encoder(K=args.steps,
                               M=self.n_cities,
                               L=args.len_encoder).to(self.device)
        self.DQN = DQN(N=self.n_agents,
                       K=args.steps,
                       L=args.len_encoder,
                       M=n_cities).to(self.device)

        self.data_loader = data_loader
        self.iter_data = iter(data_loader)
        self.n_envs = len(data_loader)
        self.idx_env = -1
        self.env = None

        self.EPS_START = self.args.eps_start
        self.EPS_END = self.args.eps_end
        self.EPS_DECAY = self.args.eps_decay

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.DQN.parameters(), lr=args.lr)

    def calc_loss(self, samples):
        self.DQN.train()
        states = []
        next_states = []

        for sample in samples:
            states.append(sample.state.reshape(1, -1))
            next_states.append(sample.next_state.reshape(1, -1))

        states = torch.cat(states)
        next_states = torch.cat(next_states)

        # add one dim at 1 (batch_size, 1, state)
        states = states.unsqueeze(1)
        next_states = next_states.unsqueeze(1)

        with torch.enable_grad():
            Q = self.DQN(states)
        with torch.no_grad():
            # bootstrap targets should not receive gradients
            Q_next = self.DQN(next_states)

        temp_Q = []
        temp_Q_next = []

        for i in range(len(samples)):
            action = samples[i].action
            reward = samples[i].reward.to(self.device)
            action_idx = action[0] * self.n_cities + action[1]

            temp_Q.append(Q[i][action_idx].reshape(1))
            temp_Q_next.append((Q_next[i].max() * self.args.gamma +
                                reward).reshape(1).cuda(self.device))

        Q = torch.cat(temp_Q).float().cuda(self.device)
        Q_next = torch.cat(temp_Q_next).float().cuda(self.device)

        loss = self.criterion(Q, Q_next)
        return loss

    def gen_env(self):
        data = next(self.iter_data)
        self.idx_env += 1
        self.env = Env(n_agents=self.n_agents,
                       n_cities=self.n_cities,
                       steps=self.args.steps,
                       conn=data["conn"],
                       tasks=data["tasks"],
                       cities=data["cities"],
                       rewards=data["rewards"],
                       destinations=data["destinations"],
                       budget=self.args.budget)

    def select_action(self, state):
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
                        math.exp(-1. * self.env.steps_done / self.EPS_DECAY)
        actions = []
        for i in range(self.n_agents):
            p = random.random()
            if p > eps_threshold:
                with torch.no_grad():
                    # print("not random")
                    q = self.DQN(state[i].reshape(1, 1, -1)).reshape(2,
                                                                     -1).max(1)
                    if q[0][0] > q[0][1]:
                        action = torch.tensor([0],
                                              device=self.device,
                                              requires_grad=False)
                        action = torch.cat(
                            (action, q[1][0].reshape(1, ).long()))
                    else:
                        action = torch.tensor([1],
                                              device=self.device,
                                              requires_grad=False)
                        action = torch.cat(
                            (action, q[1][1].reshape(1, ).long()))
                    actions.append(action)
            else:
                action = [
                    random.choice([0, 1]),
                    random.randint(0, self.n_cities - 1)
                ]
                actions.append(
                    torch.tensor(action,
                                 device=self.device,
                                 requires_grad=False))
        return actions

    def step(self):
        self.env.steps_done += 1
        x = self.env.input().reshape(self.n_agents, -1).cuda(self.device)
        phi = []
        for i in range(self.n_agents):
            with torch.no_grad():
                phi.append(self.Encoder(x[i]))
        # after encoding
        n = torch.cat(phi, dim=0).reshape(self.n_agents, -1)
        # state
        s = []
        for i in range(self.n_agents):
            ni = torch.cat((n[0:i], n[i + 1:])).reshape(-1)
            s.append(torch.cat((x[i], ni)))
        s = torch.cat(s).reshape(self.n_agents, -1)
        # epsilon-greedy
        actions = self.select_action(s)
        # collect rewards
        rewards = self.env.step(actions)
        if rewards == -1:
            return "done"
        # state_{t+1}
        x_tp1 = self.env.input().reshape(self.n_agents, -1).cuda(self.device)
        phi_tp1 = []
        for i in range(self.n_agents):
            with torch.no_grad():
                phi_tp1.append(self.Encoder(x_tp1[i]))
        n_tp1 = torch.cat(phi_tp1, dim=0).reshape(self.n_agents, -1)
        s_tp1 = []
        for i in range(self.n_agents):
            ni = torch.cat((n_tp1[0:i], n_tp1[i + 1:])).reshape(-1)
            s_tp1.append(torch.cat((x_tp1[i], ni)))
        s_tp1 = torch.cat(s_tp1).reshape(self.n_agents, -1)
        # initial Transition tuple
        res = []
        for i in range(self.n_agents):
            res.append(
                Transition(state=s[i],
                           action=actions[i],
                           next_state=s_tp1[i],
                           reward=rewards[i]))
        return res
Example #7
def train_DQN(env: WrapIt, Q: DQN, Q_target: DQN, optimizer: namedtuple,
              replay_buffer: ReplayBuffer, exploration: Schedule):
    """
    @parameters
        Q:
        Q_target:
        optimizer: torch.nn.optim.Optimizer with parameters
        buffer: store the frame
    @return
        None
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    optimizer = optimizer.constructor(Q.parameters(), **optimizer.kwargs)

    num_actions = env.action_space.n
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    LOG_EVERY_N_STEPS = 10000
    last_obs = env.reset(passit=True)

    # Q.getSummary()

    out_count = 0
    bar = tqdm(range(ARGS.timesteps))
    for t in bar:
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        if t > ARGS.startepoch:
            value = select_epsilon_greedy_action(Q, recent_observations,
                                                 exploration, t, num_actions)
            action = value[0, 0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            obs = env.reset()
        last_obs = obs
        # bar.set_description(f"{obs.shape} {obs.dtype}")

        if (t > ARGS.startepoch and t % ARGS.dqn_freq == 0
                and replay_buffer.can_sample(ARGS.batchsize)):
            bar.set_description("backward")
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             done_mask) = replay_buffer.sample(ARGS.batchsize)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TENSOR(obs_batch, act_batch, rew_batch,
                                     next_obs_batch, 1 - done_mask)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TO(obs_batch, act_batch, rew_batch,
                                 next_obs_batch, not_done_mask)

            values = Q(obs_batch)
            current_Q_values = values.gather(
                1,
                act_batch.unsqueeze(1).long()).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagated
            next_max_q = Q_target(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            Q_target_values = rew_batch + (ARGS.gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = Q_target_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            if num_param_updates % ARGS.dqn_updatefreq == 0:
                bar.set_description("update")
                Q_target.load_state_dict(Q.state_dict())
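
`train_DQN` expects `optimizer` to be a spec object exposing `constructor` and `kwargs` (see the call at the top of the function). A hedged sketch of such a spec and of the call, with `env`, `Q`, `Q_target`, `replay_buffer` and `exploration_schedule` assumed to be built elsewhere; `OptimizerSpec` is a hypothetical name:

from collections import namedtuple
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])
optimizer_spec = OptimizerSpec(constructor=optim.Adam, kwargs=dict(lr=1e-4, eps=1e-4))

train_DQN(env, Q, Q_target, optimizer_spec, replay_buffer, exploration_schedule)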
Example #8
class DQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, name, state_size, action_size, use_double_dqn=False, use_dueling=False, seed=0, lr_decay=0.9999, use_prioritized_replay=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.name = name
        self.state_size = state_size
        self.action_size = action_size
        self.use_double_dqn = use_double_dqn
        self.use_dueling = use_dueling
        self.seed = random.seed(seed)
        self.use_prioritized_replay = use_prioritized_replay

        # Q-Network
        if use_dueling:
            self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size, seed).to(device)

        self.qnetwork_target.eval()
            
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)

        # Replay memory
        if self.use_prioritized_replay:
            self.memory = PrioritizedReplayBuffer(BUFFER_SIZE, seed, alpha=0.2, beta=0.8, beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(BUFFER_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Epsilon-greedy action selection
        if random.random() > eps:
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()
        
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        if self.use_prioritized_replay:
            states, actions, rewards, next_states, dones, indices, weights = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # Get max predicted Q values (for next states) from target model
            if self.use_double_dqn:
                # Double DQN: pick the best next action with the local net, evaluate it with the target net
                best_local_actions = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(1, best_local_actions)
            else:
                Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

            # Compute Q targets for current states 
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.use_prioritized_replay:
            td_error = (Q_targets - Q_expected).squeeze()
            # importance-sampling-weighted squared TD error
            weighted_sq_error = weights * td_error.pow(2)

            with torch.no_grad():
                # use the weighted squared TD errors as new priorities
                self.memory.update_priorities(indices, weighted_sq_error.detach())

            loss = weighted_sq_error.mean()
        else:                
            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
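
A training-loop sketch for the agent above, assuming a gym-style environment (the environment id is a placeholder) and a simple hand-rolled epsilon decay; `act` and `step` are the methods defined in the class:

import gym

env = gym.make("LunarLander-v2")
agent = DQNAgent("lunar", state_size=env.observation_space.shape[0],
                 action_size=env.action_space.n, use_double_dqn=True)

eps = 1.0
for episode in range(1000):
    state, done, score = env.reset(), False, 0.0
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # learns every UPDATE_EVERY steps
        state, score = next_state, score + reward
    eps = max(0.01, eps * 0.995)  # placeholder decay schedule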
Example #9
class DoubleDQNAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 tau=0.01,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)
            self.model2 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters())
        self.optimizer2 = torch.optim.Adam(self.model2.parameters())

    def get_action(self, state, eps=0.20):
        # epsilon-greedy: act randomly with probability eps
        if np.random.rand() < eps:
            return np.random.choice(self.env.action_space)

        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        qvals = self.model1.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # resize tensors
        actions = actions.view(actions.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        # compute loss
        curr_Q1 = self.model1.forward(states).gather(1, actions)
        curr_Q2 = self.model2.forward(states).gather(1, actions)

        next_Q1 = self.model1.forward(next_states)
        next_Q2 = self.model2.forward(next_states)
        # clipped double-Q target: elementwise minimum of the two networks' maxima
        next_Q = torch.min(
            torch.max(next_Q1, 1)[0],
            torch.max(next_Q2, 1)[0])
        next_Q = next_Q.view(next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * self.gamma * next_Q

        loss1 = F.mse_loss(curr_Q1, expected_Q.detach())
        loss2 = F.mse_loss(curr_Q2, expected_Q.detach())

        return loss1, loss2

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss1, loss2 = self.compute_loss(batch)

        self.optimizer1.zero_grad()
        loss1.backward()
        self.optimizer1.step()

        self.optimizer2.zero_grad()
        loss2.backward()
        self.optimizer2.step()
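
A short interaction sketch for DoubleDQNAgent. `env` is assumed to expose `observation_space.shape` and a list-like `action_space` (as the constructor expects), and the `push(...)` call and `len()` support on BasicBuffer are assumptions about its interface:

agent = DoubleDQNAgent(env, use_conv=False)

for episode in range(200):
    state, done = env.reset(), False
    while not done:
        action = agent.get_action(state, eps=0.1)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed API
        state = next_state
        if len(agent.replay_buffer) > 64:  # assumed __len__ support
            agent.update(64)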
Example #10
    memory = per_replay.PrioritizedReplayBuffer(75000,
                                                alpha=0.4,
                                                beta=0.6,
                                                epsilon=0.001)
    action_len = 13

    demos = parse_demo(args.env_name, memory, args.demo_file)
    TARGET_UPDATE = 10
    # instantiating model and optimizer
    policy_net = DQN(64, 64, 512, action_len).to(device)
    target_net = DQN(64, 64, 512, action_len).to(device)
    # if args.load_name is not None:
    # model.load_state_dict(pickle.load(open(args.load_name, 'rb')))
    if not args.no_train:
        optimizer = optim.Adam(policy_net.parameters(), lr=args.lr)

    # instantiating policy object
    if args.no_train:
        args.eps_start = 0.0
        args.eps_end = 0.0
        args.eps_steps = 1

    policy = EpsGreedyPolicy(args.eps_start, args.eps_end, args.eps_steps)

    opt_step = 0

    # pre-training
    if not args.no_train:
        print('Pre-training')
        for i in range(1000):
Example #11
class DQNAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDQN(env.observation_space.shape,
                                 len(env.action_space)).to(self.device)
        else:
            self.model = DQN(env.observation_space.shape,
                             len(env.action_space)).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        # epsilon-greedy: act randomly with probability eps
        if np.random.rand() < eps:
            return np.random.choice(self.env.action_space)

        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        # do not bootstrap from terminal states, and keep the target out of the graph
        expected_Q = (rewards.squeeze(1) + (1 - dones) * self.gamma * max_next_Q).detach()

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
Example #12
import torch.optim as optim

import copy
import pickle

from utils import *
from models import DQN

initial_Q = AER_initial_Q()
# initial_Q = torch.zeros(n_actions, device=device)

policy_net = DQN(recent_k, n_agents, n_actions, initial_Q).to(device)
target_net = DQN(recent_k, n_agents, n_actions, initial_Q).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters())

Q = torch.zeros(n_actions, n_actions, n_actions, device=device)
for i in range(n_actions):
    for j in range(n_actions):
        Q[i, j, :] = initial_Q.view(-1)

memory = ReplayMemory(MEM_SIZE)

heat = torch.zeros(n_agents, n_actions, n_actions, device=device)

heat_unique0 = []
heat_freq0 = []
heat_unique1 = []
heat_freq1 = []
Example #13
class DQNAgent(BaseAgent):
    """
    Agent with a DQN network.
    """
    def __init__(self,
                 input_dim,
                 output_dim,
                 lr,
                 gamma,
                 max_memory_size,
                 batch_size,
                 eps_start,
                 eps_end,
                 eps_decay,
                 device,
                 linear1_units=64,
                 linear2_units=64,
                 decay_type="linear"):

        super().__init__(max_memory_size, batch_size, eps_start, eps_end,
                         eps_decay, device, decay_type)

        self.model_name = "DQN"
        self.output_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)

        # optimizer
        self.optim = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma

    def choose_action(self, state, testing=False):
        """
        Choose an action to perform. Uses eps-greedy approach.
        :param state: current state of the environment
        :param testing: if True, always choose greedy action
        :return: the action chosen
        """
        self.curr_step += 1

        if not testing and np.random.random() < self.curr_eps:
            return np.random.randint(0, self.output_dim)
        else:
            # we're using the network for inference only, we don't want to track the gradients in this case
            with torch.no_grad():
                return self.policy_net(state).argmax().item()

    def learn(self):
        """
        Update the weights of the network.
        :return: the loss
        """
        states, next_states, actions, rewards, dones = self.memory.sample(
            self.batch_size)

        curr_q_vals = self.policy_net(states).gather(1, actions)
        next_q_vals = self.policy_net(next_states).max(
            1, keepdim=True)[0].detach()
        target = (rewards + self.gamma * next_q_vals * (1 - dones)).to(
            self.device)
        loss = F.smooth_l1_loss(curr_q_vals, target)
        self.optim.zero_grad()
        loss.backward()

        self.optim.step()

        return loss.item()

    def set_test(self):
        """ Sets the network in evaluation mode """
        self.policy_net.eval()

    def set_train(self):
        """ Sets the network in training mode """
        self.policy_net.train()

    def save(self, filename):
        """
        Save the network weights.
        :param filename: path
        """
        self.policy_net.save(filename)

    def load(self, filename):
        """
        Load the network weights.
        :param filename: path of the weight file
        """
        self.policy_net.load(filename, self.device)
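
A construction sketch for this agent. The replay memory and epsilon bookkeeping (`self.memory`, `self.curr_eps`, `self.curr_step`) live in the `BaseAgent` class defined elsewhere, so only the constructor arguments and `choose_action` are exercised here, with placeholder values:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = DQNAgent(input_dim=4, output_dim=2, lr=1e-3, gamma=0.99,
                 max_memory_size=50_000, batch_size=64,
                 eps_start=1.0, eps_end=0.05, eps_decay=0.995,
                 device=device)

state = torch.zeros(1, 4, device=device)  # placeholder observation batch
action = agent.choose_action(state)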
Example #14
File: dqn.py  Project: chris-lamb/deep-rl
def initialize(game, model_name, warm_start):
    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    capacity = int(1e4)

    # Cold start
    if not warm_start:
        # Initialize model
        model = DQN(in_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=1.0e-4,
                                  weight_decay=0.01)
        # Initialize replay memory
        memory_buffer = ReplayMemory(capacity)

        # Initialize statistics
        running_reward = None
        running_rewards = []

    # Warm start
    if warm_start:

        data_file = 'results/{}_{}.p'.format(game, model_name)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, prior_eps)
            with open(model_file, 'rb') as f:
                saved_model = pickle.load(f)
                model, optimizer, memory_buffer = saved_model

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = DQN(in_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=1.0e-4,
                                      weight_decay=0.01)
            # Initialize replay memory
            memory_buffer = ReplayMemory(capacity)

            running_reward = None
            running_rewards = []

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()

    criterion = torch.nn.MSELoss()

    return env, model, optimizer, criterion, memory_buffer, cuda, running_reward, running_rewards
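
A hedged call sketch for `initialize`; the game id and model name are placeholders:

(env, model, optimizer, criterion, memory_buffer,
 cuda, running_reward, running_rewards) = initialize('PongDeterministic-v4',
                                                     model_name='dqn',
                                                     warm_start=False)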
Example #15
class Agent:
	def __init__(self):
		self.controller, self.target = DQN(), DQN() # For RL 
		self.vision = VAE()

		if USE_CUDA:
			self.controller.cuda()
			self.target.cuda()
			self.vision.cuda()

		# Init weights based on init function
		self.controller.apply(init_weights)
		self.vision.apply(init_weights)
		# Load model params into target
		self.target.load_state_dict(self.controller.state_dict())
		self.action_number = 0 # actions taken (to determine whether or not to update)
	
		# NOTE: DQN exp buffer should use embeddings generated by vision module
		# The vision module (aka the VAE) has memory consisting of game states
		self.exp_buffer = [] # exp buffer
		self.exp_number = 0 # size of exp buffer so far

		self.opt = torch.optim.Adam(self.controller.parameters(),lr=DQN_LEARNING_RATE)
		self.loss = nn.SmoothL1Loss()

	# Make an action given a state
	def act(self, state, explore=True):
		self.action_number += 1
		# Update target
		if self.action_number % TARGET_INTERVAL == 0:
			self.target.load_state_dict(self.controller.state_dict())

		if explore and np.random.rand() <= EPSILON:
			# Act randomly
			a = np.random.randint(NUM_ACTIONS)
			return a
		
		# Send state to model
		a_vec = self.controller(self.vision.encode(state))
		a = int(torch.argmax(torch.squeeze(a_vec)))
		return a

	def load_params(self):
		# Looks in current directory for params for model and for VAE		 
		if LOAD_CHECKPOINT_VAE:
			try:
				self.vision.load_state_dict(torch.load("VAEparams.pt"))
				print("Loaded checkpoint for VAE")
			except:
				print("Could not load VAE checkpoint")
		if LOAD_CHECKPOINT_DQN:
			try:
				self.controller.load_state_dict(torch.load("DQNparams.pt"))
				self.target.load_state_dict(torch.load("DQNparams.pt"))
				print("Loaded checkpoint for DQN")
			except:
				print("Could not load DQN checkpoint")

	def save_params(self):
		torch.save(self.controller.state_dict(), "DQNparams.pt")
		torch.save(self.vision.state_dict(), "VAEparams.pt")

	# clear the buffer
	def clear_exp_buffer(self):
		self.exp_buffer = []
		self.exp_number = 0
		self.vision.memory = []
		self.vision.memory_num = 0

	# Add experience to exp buffer
	def add_exp(self, exp):
		self.vision.remember(exp[0])

		if self.exp_number >= EXP_BUFFER_MAX:
			del self.exp_buffer[0]
		else:
			self.exp_number += 1

		exp[0] = self.vision.encode(exp[0])
		exp[3] = self.vision.encode(exp[3])

		self.exp_buffer.append(exp)

	# Replay gets batch and trains on it
	# Returns [vision loss, controller loss]
	def replay(self, batch_size):
		v_loss, q_loss = 0,0 # Init to 0 in case we need to return without any training

		# Train vision component first
		if self.action_number % VAE_UPDATE_INTERVAL == 0:
			v_loss = self.vision.replay()
	
	# If experience buffer isn't right size yet, don't do anything
		if self.exp_number < EXP_BUFFER_MIN or self.action_number % TRAINING_INTERVAL != 0: return [v_loss, q_loss]

		# Get batch from experience_buffer
		batch = random.sample(self.exp_buffer, batch_size)
		
		s,a,r,s_new,_ = zip(*batch)
		s_new = s_new[:-1] # Remove last

		# First turn batch into something we can run through model
		s = torch.cat(s)
		a = torch.LongTensor(a).unsqueeze(1)
		r = torch.FloatTensor(r)
		s_new = torch.cat(s_new)
		
		if USE_CUDA:
			a = a.cuda()
			r = r.cuda()

		# Get q vals for s (what model outputted) from a
		# .gather gets us q value for specific action a
		pred_q_vals = self.controller(s).gather(1, a).squeeze()

		# Having chosen a in s,
		# What is the highest possible reward we can get from s_new?
		# We add q of performing a in s then add best q from next state
		# cat 0 to end for the terminal state
		s_new_q_vals = self.target(s_new).max(1)[0]

		zero = torch.zeros(1)
		if USE_CUDA: zero = zero.cuda()
		s_new_q_vals = torch.cat((s_new_q_vals, zero))
		exp_q_vals = r + s_new_q_vals*GAMMA
		
		myloss = self.loss(pred_q_vals, exp_q_vals)
		self.opt.zero_grad()
		myloss.backward()
		

		if WEIGHT_CLIPPING:
			for param in self.controller.parameters():
				param.grad.data.clamp_(-1,1) # Weight clipping avoids exploding gradients

		self.opt.step()
		
		global EPSILON
		if EPSILON > EPSILON_MIN:
			EPSILON *= EPSILON_DECAY 
	
		return [v_loss, myloss.item()]
Example #16
class Agent():
    def __init__(self, learn_rate, input_shape, num_actions, batch_size):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = 0.99
        self.tau = 0.05
        self.has_target_net = False

        self.memories = []
        # self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=2000)
        self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)

        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.net = DQN().to(self.device)
        if self.has_target_net:
            self.target_net = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        for param, target_param in zip(self.net.parameters(),
                                       self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.device)

        q_values, hidden_state_ = self.net(state, hidden_state)
        action = torch.argmax(q_values).item()

        if random.random() <= self.epsilon.value():
            action = random.randint(0, self.num_actions - 1)

        return action, hidden_state_

    def fetch_batch(self):
        indices = np.random.choice(len(self.memories),
                                   self.batch_size,
                                   replace=False)
        indices = list(indices)

        for idx in indices:
            yield self.memories[idx]

    def store_trajectory(self, trajectory):
        self.memories.append(trajectory)

    def learn(self):
        if len(self.memories) < self.batch_size:
            return

        batch_losses = []
        for memory_idx, memory in enumerate(self.fetch_batch()):
            states, actions, rewards, dones = memory.fetch_on_device(
                self.device)

            self.net.train()

            episode_losses = []
            hidden_state = self.net.get_new_hidden_state().to(self.device)
            second_to_last_memory_index = len(memory.states) - 1
            for i in range(second_to_last_memory_index):
                state = states[i].detach()
                state_ = states[i + 1].detach()
                action = actions[i].detach()
                reward = rewards[i].detach()

                if i == second_to_last_memory_index - 1:
                    done = True
                else:
                    done = False

                qs, hidden_state_ = self.net(state, hidden_state)
                chosen_q = qs[action]

                if self.has_target_net:
                    qs_, hidden_state_3 = self.target_net(
                        state_, hidden_state_)
                    action_qs_, hidden_state_3 = self.net(
                        state_, hidden_state_)
                    action_ = torch.argmax(action_qs_)
                    chosen_q_ = qs_[action_]
                else:
                    action_qs_, hidden_state_3 = self.net(
                        state_, hidden_state_)
                    chosen_q_ = torch.max(action_qs_)
                if done:
                    chosen_q_ = torch.tensor(0.0, dtype=torch.float32).to(
                        self.device)

                q_target = reward + self.gamma * chosen_q_.detach()

                loss = (q_target - chosen_q)**2

                episode_losses.append(loss)

                hidden_state = hidden_state_

            episode_loss = sum(episode_losses) / len(episode_losses)
            batch_losses.append(episode_loss)

        batch_loss = sum(batch_losses) / len(batch_losses)
        self.optimizer.zero_grad()
        batch_loss.backward()
        self.optimizer.step()

        for i in range(self.batch_size):
            self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()
Example #17
class DQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=0.9999,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn',
                 device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # put on correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Loss
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # Choose an action
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.policy.forward(states)[indices, actions]
        q_next = self.target.forward(states_).max(dim=1)[0]

        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.loss(q_target, q_pred).to(self.device)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
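
A usage sketch for this agent; the environment is an assumption and the shape/size arguments are placeholders:

agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                 n_actions=env.action_space.n,
                 input_dims=env.observation_space.shape,
                 mem_size=50_000, batch_size=32,
                 algo='DQN', env_name='Pong', device='cpu')

obs, done = env.reset(), False
while not done:
    action = agent.choose_action(obs)
    obs_, reward, done, info = env.step(action)
    agent.store_transition(obs, action, reward, obs_, done)
    agent.learn()
    obs = obs_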
Example #18
class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()

        self.exp_buffer = Memory()
        self.exp_number = 0  # size of exp buffer so far
        self.param_updates = 0  # track how many times params updated

        self.opt = torch.optim.RMSprop(self.model.parameters(),
                                       lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))

        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        q_loss = 0
        # If experience buffer isn't right size yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE: return
        # Get batch from experience_buffer
        batch = self.exp_buffer.get_batch(batch_size)

        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is 'None')
        # First turn batch into something we can run through model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)

        #print(a.shape,r.shape, s.shape, s_new.shape)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()

        # Get q vals for s (what model outputted) from a
        # .gather gets us q value for specific action a
        pred_q_vals = self.model(s).gather(1, a)

        # Having chosen a in s,
        # What is the highest possible reward we can get from s_new?
        # We add q of performing a in s then add best q from next state
        # cat 0 to end for the terminal state
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)  # Q value for the removed terminal next state
        if USE_CUDA: zero = zero.cuda()

        s_new_q_vals = torch.cat((s_new_q_vals, zero)).unsqueeze(1)
        exp_q_vals = r + s_new_q_vals * GAMMA

        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()

        if WEIGHT_CLIPPING:
            for param in self.model.parameters():
                param.grad.data.clamp_(
                    -1, 1)  # Gradient clipping avoids exploding gradients

        self.opt.step()

        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())

        self.param_updates += 1

        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return myloss.item()
Example #19
                                j.add(infomation)
                        if feat == 2:  
                            if receive.edgeCountInfo[edge] < give.edgeCountInfo[edge]: 
                                receive.edgeCountInfo[edge] = give.edgeCountInfo[edge]
                                j.add(infomation)
                    for i in range(num_agent): 
                        if i != give.num and i != receive.num: receive.featureUpdate[i] = receive.featureUpdate[i].union(j)
                    give.featureUpdate[receive.num].clear()
                elif give.num == receive.num: give.featureUpdate[receive.num].clear()

model = DQN(nfeat=num_feature)
# model.load_state_dict(torch.load(lists))  #retrain
model_target = DQN(nfeat=num_feature)
model_target.load_state_dict(model.state_dict())
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0002)
replay = namedtuple('replay',('nextnode','state','action','reward','next_state'))
class Replay_buffer():
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        # object array holding `replay` namedtuples
        self.buffer = np.empty(buffer_size, dtype=object)
        self.index = 0
        self.cur_size = 0

    def push(self, experience):
        self.buffer[self.index] = experience
        self.index = (self.index + 1) % self.buffer_size
        if self.cur_size < self.buffer_size:
            self.cur_size += 1

    def sample(self, batch_size):
        sample_index = np.random.choice(np.arange(self.cur_size), size=batch_size, replace=False)
        return self.buffer[sample_index]
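
A small usage sketch for the buffer and the `replay` namedtuple defined above; the field values are placeholders:

buffer = Replay_buffer(buffer_size=10_000)
s, s_next = torch.zeros(num_feature), torch.zeros(num_feature)  # placeholder states
buffer.push(replay(nextnode=0, state=s, action=1, reward=0.0, next_state=s_next))

if buffer.cur_size >= 32:
    batch = buffer.sample(batch_size=32)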
Example #20
class Agent:
    def __init__(self):
        self.model = DQN()
        self.exp_buffer = []  # exp buffer
        self.exp_number = 0  # size of exp buffer so far

        self.opt = torch.optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.loss = nn.MSELoss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(2)
            return a

        # Send state to model
        state = torch.from_numpy(state).float()
        a_vec = self.model(state)
        a = int(torch.argmax(a_vec))
        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = []
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        if self.exp_number == MAX_BUFFER_SIZE:
            del self.exp_buffer[0]
        else:
            self.exp_number += 1

        # Convert numpy arrays to tensor
        exp[0] = torch.from_numpy(exp[0]).float()
        if exp[4] == False: exp[3] = torch.from_numpy(exp[3]).float()

        self.exp_buffer.append(exp)

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        # If experience buffer isn't right size yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE: return
        # Get batch from experience_buffer
        batch_ind = list(
            torch.randint(self.exp_number, (batch_size, )).numpy())
        batch = get_sublist(self.exp_buffer, batch_ind)
        q_loss = 0
        # Go through samples
        for s, a, r, s_new, done in batch:

            if done:
                Q_val = r
            else:
                with torch.no_grad():
                    # bootstrap target; no gradient through the next-state value
                    Q_val = r + GAMMA * torch.max(self.model(s_new))

            self.opt.zero_grad()

            Q_pred = self.model(s)
            # target equals the prediction except for the taken action
            Q_targ = Q_pred.detach().clone()
            Q_targ[a] = Q_val

            myloss = self.loss(Q_pred, Q_targ)
            myloss.backward()
            q_loss += myloss.item()
            self.opt.step()

        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return q_loss
Example #21
    if sample > eps_threshold:
        obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
        # Inference only, so no autograd history needs to be saved
        with torch.no_grad():
            return model(obs).data.max(1)[1].cpu()
    else:
        return torch.IntTensor([[random.randrange(NUM_ACTIONS)]])


# vis = visdom.Visdom(port=8124)

# Initialize target q function and q function
Q = DQN(IMG_C, FRAME_HISTORY_LEN, NUM_ACTIONS).type(dtype)
target_Q = DQN(IMG_C, FRAME_HISTORY_LEN, NUM_ACTIONS).type(dtype)

# Construct Q network optimizer function
optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

# Construct the replay buffer
replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, FRAME_HISTORY_LEN)

###############
# RUN ENV     #
###############
num_param_updates = 0
mean_episode_reward = -float('nan')
best_mean_episode_reward = -float('inf')
last_obs = env.reset()
episodes_rewards = []

for t in count():
    ### Step the env and store the transition