Example #1
class DDPG(object):
    """
    Deep Deterministic Policy Gradient Algorithm
    """
    def __init__(self, env, writer=None):
        self.env = env
        self.writer = writer

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        self.max_action = env.action_space.high[0]

        # Randomly initialize network parameters
        self.actor = Actor(state_dim, action_dim).to('cuda')
        self.critic = Critic(state_dim, action_dim).to('cuda')

        # Initialize target network parameters
        self.target_actor = Actor(state_dim, action_dim).to('cuda')
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic = Critic(state_dim, action_dim).to('cuda')
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Replay memory
        self.memory = ReplayMemory(state_dim, action_dim)

        self.gamma = gamma
        self.criterion = nn.MSELoss()
        self.tau = tau

        # Network parameter optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)

    def get_action(self, state, ou_noise=None, timestep=None):
        # Evaluation: act greedily without exploration noise
        if ou_noise is None:
            return self.actor(torch.from_numpy(state).to(
                'cuda', torch.float)).to('cpu').detach().numpy().copy()
        # Training: add exploration noise and clip to the action range
        action = self.actor(torch.from_numpy(state).to('cuda', torch.float))
        noise = ou_noise(timestep)
        return np.clip(action.to('cpu').detach().numpy().copy() + noise, -1, 1)

    def store_transition(self, state, action, state_, reward, done):
        self.memory.store_transition(state, action, state_, reward, done)

    def soft_update(self, target_net, net):
        """Target parameters soft update"""
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self, time_step, batch_size=64):
        """Network parameter update"""
        if len(self.memory) < batch_size:
            return

        states, actions, states_, rewards, terminals = self.memory.sample(
            batch_size)

        # Calculate the expected value (terminals is assumed to be a continuation mask: 0 when done)
        with torch.no_grad():
            y = rewards.unsqueeze(1) + terminals.unsqueeze(1) * self.gamma * \
                self.target_critic(states_, self.target_actor(states_))

        # Update Critic
        q = self.critic(states, actions)
        critic_loss = self.criterion(q, y)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        if self.writer:
            self.writer.add_scalar("loss/critic", critic_loss.item(),
                                   time_step)

        # Update Actor (Policy Gradient)
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        if self.writer:
            self.writer.add_scalar("loss/actor", actor_loss.item(), time_step)

        # target parameter soft update
        self.soft_update(self.target_actor,
                         self.actor)  # update target actor network
        self.soft_update(self.target_critic,
                         self.critic)  # update target critic network

    def save_model(self, path='models/'):
        torch.save(self.actor.state_dict(), path + 'actor')
        torch.save(self.critic.state_dict(), path + 'critic')
        torch.save(self.target_actor.state_dict(), path + 'target_actor')
        torch.save(self.target_critic.state_dict(), path + 'target_critic')

    def load_model(self, path='models/'):
        self.actor.load_state_dict(torch.load(path + 'actor'))
        self.critic.load_state_dict(torch.load(path + 'critic'))
        self.target_actor.load_state_dict(torch.load(path + 'target_actor'))
        self.target_critic.load_state_dict(torch.load(path + 'target_critic'))
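
A minimal driver loop for the class above might look like the following sketch. It assumes a Gym-style continuous-control `env` and an `ou_noise(t)` callable returning an action-shaped noise array (both assumptions, since neither is shown here), and it scales the clipped action by `max_action` on the assumption that the actor outputs values in [-1, 1].

# Hypothetical training loop for the DDPG class above (sketch, not part of the source project)
agent = DDPG(env)
total_steps = 0
for episode in range(1000):
    state, episode_reward, done = env.reset(), 0.0, False
    while not done:
        action = agent.get_action(state, ou_noise=ou_noise, timestep=total_steps)
        next_state, reward, done, _ = env.step(action * agent.max_action)
        agent.store_transition(state, action, next_state, reward, done)
        agent.update(total_steps)
        state = next_state
        episode_reward += reward
        total_steps += 1
    print('episode {}: reward {:.1f}'.format(episode, episode_reward))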
Example #2
class DQN_agent:
    def __init__(self,env,policy,target,n_action=18,capacity=100000,batch_size=32,lr=2.5e-4,gamma=0.99,burn_in=50000,C=1000,eps_decay=1000000):
        self.env=env
        self.n_action=n_action
        self.memory=ReplayMemory(capacity)
        self.device="cuda"
        self.policy=policy
        self.target=target
        self.batch_size=batch_size
        self.gamma=gamma
        self.lr=lr
        self.opt= optim.Adam(self.policy.parameters(), lr=self.lr)
        self.burn_in=burn_in
        self.C=C
        self.eps_decay=eps_decay
        self.loss=nn.MSELoss()
    def get_state(self,obs):
        state=torch.FloatTensor(np.array(obs).transpose(2,0,1)).unsqueeze(0)
        return(state)
    def get_action(self,state,eps):
        x=random.random()
        if x<eps:
            return(torch.tensor([[random.randrange(self.n_action)]], dtype=torch.long))
        else:
            with torch.no_grad():
                return(self.policy(state.to("cuda")).max(1)[1].view(1,1))
    def update_policy(self):
        state,action,reward,next_state,done=self.memory.sample(self.batch_size)
        state=state.to("cuda")
        action=action.to("cuda")
        next_state=next_state.to("cuda")
        reward=reward.to("cuda")
        done=done.to("cuda")
        q=self.policy(state).gather(1,action.unsqueeze(1)).squeeze(1)
        q_max=self.target(next_state).max(1)[0].detach()
        y=(reward+self.gamma*q_max)*(1-done)+reward*done
        loss=self.loss(q,y)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())
    def train(self,episodes):
        steps=0
        reward_list=[]
        for episode in range(episodes):
            obs=self.env.reset()
            state=self.get_state(obs)
            reward_episode=0
            done=False
            while not done:
                steps+=1
                test_eps=int(steps>self.eps_decay)
                eps=(1-steps*(1-0.1)/self.eps_decay)*(1-test_eps)+0.1*test_eps
                action=self.get_action(state,eps)
                obs,reward,done,info=self.env.step(action.item())
                reward_episode+=reward
                next_state=self.get_state(obs)
                reward = torch.tensor([reward], device="cpu", dtype=torch.float)
                action = torch.tensor([action], device="cpu", dtype=torch.long)
                done = torch.tensor([int(done)], device="cpu", dtype=int)
                self.memory.push(state,action,reward,next_state,done)
                if steps>self.burn_in:
                    self.update_policy()
                if steps>self.burn_in and steps%self.C==0:
                    self.update_target()
                state=next_state
            reward_list.append(reward_episode)
            if episode%100 == 0:
                print('Total steps: {} \t Episode: {}/{} \t Mean reward (last 100): {}'.format(steps, episode, episodes, np.mean(reward_list[-100:])))
            if episode%500==0:
                print(reward_list)
        self.env.close()
        print(reward_list)
        return(reward_list)
    def save_model(self,name):
        torch.save(self.policy,name)
        return
    def load_model(self,name):
        self.policy=torch.load(name)
    def test(self,n_episodes):
        test_reward=[]
        for episode in range(n_episodes):
            obs = self.env.reset()
            state = self.get_state(obs)
            reward_episode = 0.0
            done=False
            while not done:
                with torch.no_grad():
                    action=self.policy(state.to("cuda")).max(1)[1].view(1,1)
                obs,reward,done,info=self.env.step(action.item())
                reward_episode+=reward
                state=self.get_state(obs)
                if done:
                    print("Finished Episode {} with reward {}".format(episode, reward_episode))
            test_reward.append(reward_episode)
        self.env.close()
        return test_reward
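
The agent above assumes a `ReplayMemory(capacity)` whose `push` takes one transition of tensors and whose `sample` returns five already-batched tensors in the push order; since that class is not shown, the following is only a compatible sketch.

import random
from collections import deque

import torch


class ReplayMemory:
    """Minimal FIFO buffer compatible with how DQN_agent calls it (sketch, not the original class)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (torch.cat(states),                    # (B, C, H, W)
                torch.cat(actions).view(-1).long(),   # (B,)
                torch.cat(rewards).view(-1).float(),  # (B,)
                torch.cat(next_states),               # (B, C, H, W)
                torch.cat(dones).view(-1).float())    # (B,)

    def __len__(self):
        return len(self.buffer)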
Example #3
    args.device = torch.device('cpu')


# Simple ISO 8601 timestamped logger
def log(s):
    print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)


# Environment
env = Env(args)
env.train()
action_space = env.action_space()

# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                         args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False

    next_state, _, done = env.step(random.randint(0, action_space - 1))
    val_mem.append(state, None, None, done)
    state = next_state
    T += 1
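
The `priority_weight_increase` computed above is typically applied once per training step to anneal the importance-sampling exponent of the prioritized replay memory toward 1. A hedged sketch of that step, assuming `mem` exposes a `priority_weight` attribute (not shown in this snippet):

# Inside the subsequent training loop (sketch): anneal the importance-sampling weight beta -> 1
mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)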
Example #4
class MADDPGAgent(Agent):
    def __init__(self, index, name, env, actor, critic, params):
        self.index = index
        self.name = name
        self.env = env

        self.actor = actor.to(DEVICE)
        self.critic = critic.to(DEVICE)
        self.actor_target = actor.clone().to(DEVICE)
        self.critic_target = critic.clone().to(DEVICE)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=params.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=params.lr_critic)
        self.memory = ReplayMemory(params.memory_size, params.max_episode_len,
                                   self.actor.n_outputs, self.actor.n_inputs)
        self.mse = torch.nn.MSELoss()

        # params
        self.batch_size = params.batch_size
        self.tau = params.tau
        self.gamma = params.gamma
        self.clip_grads = True

        # flags
        # local obs/actions means only the obs/actions of this agent are available
        # if obs and actions are local this is equivalent to DDPG
        self.local_obs = params.local_obs
        self.local_actions = params.local_actions or params.local_obs

        # agent modeling
        self.use_agent_models = params.use_agent_models
        self.agent_models = {}
        self.model_optims = {}
        self.model_lr = params.modeling_lr
        self.entropy_weight = 1e-3
        self.max_past = params.max_past
        self.modeling_train_steps = params.modeling_train_steps
        self.modeling_batch_size = params.modeling_batch_size
        self.model_class = Actor

        # action and observation noise
        self.obfuscate_others = (params.sigma_noise
                                 is not None) or (params.temp_noise
                                                  is not None)
        self.sigma_noise = params.sigma_noise
        self.temp_noise = params.temp_noise

    def init_agent_models(self, agents):
        for agent in agents:
            if agent is self:
                continue
            agent_model = self.model_class.from_actor(agent.actor).to(DEVICE)
            self.agent_models[agent.index] = agent_model
            optim = torch.optim.Adam(agent_model.parameters(),
                                     lr=self.model_lr)
            self.model_optims[agent.index] = optim

    def update_params(self, target, source):
        zipped = zip(target.parameters(), source.parameters())
        for target_param, source_param in zipped:
            updated_param = target_param.data * (1.0 - self.tau) + \
                source_param.data * self.tau
            target_param.data.copy_(updated_param)

    def act(self, obs, explore=True):
        obs = torch.tensor(obs, dtype=torch.float,
                           requires_grad=False).to(DEVICE)
        actions = self.actor.select_action(obs, explore=explore).detach()
        return actions.to('cpu').numpy()

    def experience(self, episode_count, obs, action, reward, new_obs, done):
        self.memory.add(episode_count, obs, action, reward, new_obs,
                        float(done))

    def train_actor(self, batch):
        ### forward pass ###
        pred_actions = self.actor.select_action(batch.observations[self.index])
        actions = list(batch.actions)
        actions[self.index] = pred_actions
        q_obs = [batch.observations[self.index]
                 ] if self.local_obs else batch.observations
        q_actions = [actions[self.index]] if self.local_actions else actions
        pred_q = self.critic(q_obs, q_actions)

        ### backward pass ###
        p_reg = torch.mean(
            self.actor.forward(batch.observations[self.index])**2)
        loss = -pred_q.mean() + 1e-3 * p_reg
        self.actor_optim.zero_grad()
        loss.backward()
        if self.clip_grads:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
        self.actor_optim.step()
        return loss

    def train_critic(self, batch, agents):
        """Train critic with TD-target."""
        ### forward pass ###
        # (a_1', ..., a_n') = (mu'_1(o_1'), ..., mu'_n(o_n'))
        self_obs = batch.next_observations[self.index]
        self_action = self.actor_target.select_action(self_obs).detach()
        if self.local_actions:
            pred_next_actions = [self_action]
        elif self.use_agent_models:
            pred_next_actions = [
                m.select_action(batch.next_observations[idx]).detach()
                for idx, m in self.agent_models.items()
            ]
            pred_next_actions.insert(self.index, self_action)
        else:
            pred_next_actions = [
                a.actor_target.select_action(o).detach()
                for o, a in zip(batch.next_observations, agents)
            ]

        q_next_obs = [batch.next_observations[self.index]
                      ] if self.local_obs else batch.next_observations
        q_next = self.critic_target(q_next_obs, pred_next_actions)
        reward = batch.rewards[self.index]
        done = batch.dones[self.index]

        # if not done: y = r + gamma * Q(o_1, ..., o_n, a_1', ..., a_n')
        # if done:     y = r
        q_target = reward + (1.0 - done) * self.gamma * q_next

        ### backward pass ###
        # loss(params) = mse(y, Q(o_1, ..., o_n, a_1, ..., a_n))
        q_obs = [batch.observations[self.index]
                 ] if self.local_obs else batch.observations
        q_actions = [batch.actions[self.index]
                     ] if self.local_actions else batch.actions
        loss = self.mse(self.critic(q_obs, q_actions), q_target.detach())

        self.critic_optim.zero_grad()
        loss.backward()
        if self.clip_grads:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optim.step()
        return loss

    def train_models(self, batch, agents):
        for idx, model in self.agent_models.items():
            obs = batch.observations[idx]
            actions = batch.actions[idx]
            distributions = model.prob_dists(obs)
            split_actions = torch.split(actions,
                                        agents[idx].actor.action_split,
                                        dim=-1)
            self.model_optims[idx].zero_grad()
            losses = torch.zeros(len(distributions))
            for i, (actions,
                    dist) in enumerate(zip(split_actions, distributions)):
                entropy = dist.base_dist._categorical.entropy()
                loss = (dist.log_prob(actions).mean() +
                        self.entropy_weight * entropy).mean()
                losses[i] = loss
            loss = -torch.mean(losses)
            loss.backward()
            self.model_optims[idx].step()
        # return the loss of the last model trained
        return loss

    def compare_models(self, agents, batch):
        kls = []
        for idx, model in self.agent_models.items():
            kls.append([])
            obs = batch.observations[idx]
            modelled_distributions = model.prob_dists(obs)
            agent_distributions = agents[idx].actor.prob_dists(obs)
            for model_dist, agent_dist in zip(modelled_distributions,
                                              agent_distributions):
                kl_div = torch.distributions.kl.kl_divergence(
                    agent_dist, model_dist).data
                kls[-1].append(kl_div.mean())
        return zip(self.agent_models.keys(), kls)

    def add_noise_(self, batch):
        for i in range(len(batch.actions)):
            if i == self.index:
                continue
            # get observations and actions for agent i
            obs = batch.observations[i]
            actions = batch.actions[i]
            # create noise tensors, same shape and on same device
            if self.sigma_noise is not None:
                obs = obs + torch.randn_like(obs) * self.sigma_noise
            if self.temp_noise is not None:
                temp = torch.tensor(self.temp_noise,
                                    dtype=torch.float,
                                    device=actions.device)
                # avoid zero probs which lead to nan samples
                probs = actions + 1e-45
                actions = RelaxedOneHotCategorical(temp, probs=probs).sample()
            # add noise
            batch.observations[i] = obs
            batch.actions[i] = actions

    def update(self, agents):
        # collect transition memories from all agents
        memories = [a.memory for a in agents]

        # train model networks
        if self.use_agent_models:
            model_losses = []
            for _ in range(self.modeling_train_steps):
                batch = self.memory.sample_transitions_from(
                    memories, self.modeling_batch_size, max_past=self.max_past)
                if self.obfuscate_others:
                    self.add_noise_(batch)
                model_losses.append(self.train_models(batch, agents).data)
            model_loss = np.mean(model_losses)
            model_kls = self.compare_models(agents, batch)
        else:
            model_loss = None
            model_kls = None

        # sample minibatch
        batch = self.memory.sample_transitions_from(memories, self.batch_size)
        if self.obfuscate_others:
            self.add_noise_(batch)
        # train actor and critic network
        actor_loss = self.train_actor(batch)
        critic_loss = self.train_critic(batch, agents)

        # update target network params
        self.update_params(self.actor_target, self.actor)
        self.update_params(self.critic_target, self.critic)

        return actor_loss, critic_loss, model_loss, model_kls

    def get_state(self):
        if self.agent_models:
            models = {i: m.state_dict() for i, m in self.agent_models.items()}
            optims = {i: o.state_dict() for i, o in self.model_optims.items()}
            model_pair = (models, optims)
        else:
            model_pair = None
        return {
            'actor': self.actor.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
        }, model_pair

    def load_state(self, state):
        for key, value in state['state_dicts'].items():
            getattr(self, key).load_state_dict(value)
        if 'models' in state:
            models, optims = state['models']
            for i, m in models.items():
                self.agent_models[i].load_state_dict(m)
            for i, o in optims.items():
                self.model_optims[i].load_state_dict(o)
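
Each `MADDPGAgent.update` call expects the full list of agents so it can pool their replay memories and query their target actors. A hypothetical outer training step, assuming `agents` is a list of `MADDPGAgent` instances with transitions already stored, might look like the sketch below.

def train_step(agents):
    """Run one update for every agent and collect scalar losses (sketch, not from the source project)."""
    stats = {}
    for agent in agents:
        actor_loss, critic_loss, model_loss, model_kls = agent.update(agents)
        stats[agent.name] = (float(actor_loss), float(critic_loss))
    return stats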
Example #5
File: main.py  Project: BYU-PCCL/vae-rl
  args.device = torch.device('cuda')
  torch.cuda.manual_seed(np.random.randint(1, 10000))
  torch.backends.cudnn.enabled = False
else:
  args.device = torch.device('cpu')

def log(s):
  print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)

env = Env(args)
env.train()
action_space = env.action_space()

dqn = Agent(args, env)

mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
  if done:
    state, done = env.reset(), False
  next_state, _, done = env.step(np.random.randint(0, action_space))
  val_mem.append(state, None, None, done)
  state = next_state
  T += 1

if args.evaluate:
  dqn.eval()
  avg_reward, avg_Q, env = test(args, 0, dqn, val_mem, env, evaluate=True)
Example #6
			if np.random.random() < epsilon:
				action = np.random.randint(4)

			for i in range(self.action_repeat):
				reward = self.environment.act(action)
				total_score += reward
				self.environment.update_screen()


		return total_score


sess = tf.InteractiveSession()
counter = Counter(7000000)

replay_memory = ReplayMemory(1000000)
dqn_agent = DQNAgent((84,84,4), NATURE, 4, replay_memory, counter, tf_session=sess)
agent = EpsilonAgent(dqn_agent, 4, counter)
agi = AtariGameInterface('Breakout.bin', agent, replay_memory, counter)

# Create a Tensorboard monitor and populate with the desired summaries
tensorboard_monitor = TensorboardMonitor('./log', sess, counter)
tensorboard_monitor.add_scalar_summary('score', 'per_game_summary')
tensorboard_monitor.add_scalar_summary('training_loss', 'training_summary')
for i in range(4):
	tensorboard_monitor.add_histogram_summary('Q%d_training' % i, 'training_summary')

checkpoint_monitor = CheckpointRecorder(dqn_agent.dqn, replay_memory, counter, './checkpoints', sess)
agi.add_listener(checkpoint_monitor)
agi.add_listener(tensorboard_monitor)
dqn_agent.add_listener(tensorboard_monitor)
Example #7
    torch.backends.cudnn.benchmark = True


# Simple timestamped logger
def log(s):
    print('[' + str(datetime.now().time()) + '] ' + s)


# Environment
env = Env(args)
env.train()
action_space = env.action_space()

# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                         args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size - args.history_length + 1:
    if done:
        state, done = env.reset(), False
        val_mem.preappend()  # Set up memory for beginning of episode

    val_mem.append(state, None, None)
    state, _, done = env.step(random.randint(0, action_space - 1))
    T += 1
    # No need to postappend on done in validation memory
Example #8
class DQNAgent:
    def __init__(self, environment):
        self.env = environment
        self.memory = ReplayMemory(MEMORY_CAPACITY)
        self.dim_actions = self.env.action_space.n
        self.dim_states = self.env.observation_space.shape
        self.NN = NN(self.env.observation_space.shape, self.env.action_space.n,
                     BATCH_SIZE, SIZE_HIDDEN, LEARNING_RATE, ACTIVATION)
        self.observers = []
        self.episode_count = 0
        self.step_count_total = 1
        self.step_count_episode = 1
        self.epsilon_min = EPSILON_MIN
        self.epsilon_max = EPSILON_MAX
        self.epsilon_decay = EPSILON_DECAY
        self.target_update = TARGET_UPDATE
        self.max_steps = MAX_STEPS
        self.n_episodes = N_EPISODES
        self.epsilon = EPSILON_MAX
        self.batch_size = BATCH_SIZE
        self.usetarget = False
        self.gamma = GAMMA
        self.loss = 0
        self.done = False
        self.reward = 0
        self.reward_episode = 0
        self.learning_switch = False
        self.learning_start = LEARNING_START

    def notify(self, event):
        for observer in self.observers:
            observer(event)
        pass

    def act(self, state):
        self.step_count_total += 1
        action = self.choose_action(state)
        return action

    def learn(self, obs):
        self.memory.store(obs)
        if self.learning_switch:
            self.backup()
        self.notify('step_done')
        pass

    def backup(self):
        self.flashback()
        if self.step_count_total % self.target_update == 0:
            print('update')
            print(self.epsilon)
            self.NN.update_target()
            self.usetarget = True
        pass

    def flashback(self):
        X, y = self._make_batch()
        self.loss = self.NN.train(X, y)
        if np.isnan(self.loss.history['loss']).any():
            print('Warning, loss is {}'.format(self.loss))
        pass

    def choose_action(self, state):
        if np.random.rand() <= self.epsilon:
            choice = self.random_choice()
        else:
            choice = self.greedy_choice(state)
        return choice

    def greedy_choice(self, state):
        greedy_choice = self.NN.best_action(state, usetarget=False)
        return greedy_choice

    def random_choice(self):
        random_choice = np.random.randint(0, self.dim_actions)
        return random_choice

    def _make_batch(self):
        X = []
        y = []
        batch = self.memory.get_batch(self.batch_size)
        for state, action, newstate, reward, done in batch:
            X.append(state)
            target = self.NN.predict(state, False)
            q_vals_new_t = self.NN.predict(newstate, self.usetarget)
            a_select = self.NN.best_action(newstate, False)
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * q_vals_new_t[a_select]
            y.append(target)
        return X, y

    def add_observer(self, observer):
        self.observers.append(observer)
        pass
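
When `usetarget` is True, `_make_batch` above selects the next-state action with the online network and evaluates it with the target network, i.e. a Double-DQN style target. Spelled out for a single transition (an illustrative sketch, with `q_online_next` and `q_target_next` standing for the two networks' Q-vectors at the next state):

import numpy as np


def double_dqn_target(reward, done, q_online_next, q_target_next, gamma=0.99):
    """Double DQN: select the action with the online net, evaluate it with the target net."""
    if done:
        return reward
    a_select = np.argmax(q_online_next)              # argmax under the online network
    return reward + gamma * q_target_next[a_select]  # value taken from the target network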
Example #9
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE #### 
            # Choose a random action
            return torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long)
        else:
            ### CODE ####
            # Choose the best action
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).cuda()
                return self.policy_net(state).max(1)[1].view(1, 1)

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        next_states = torch.tensor(next_states).cuda()
        dones = mini_batch[3]  # checks if the game is over
        # mask of non-terminal transitions; terminal states contribute no bootstrap value
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)

        # Compute Q(s_t, a), the Q-value of the current state
        state_action_values = self.policy_net(states).gather(1, actions.view(batch_size, -1))
        # Compute Q function of the next state (zero for terminal transitions)
        next_state_values = torch.zeros(batch_size, device=device)
        next_state_values[mask] = self.policy_net(next_states[mask]).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.discount_factor) + rewards
        # Compute the Huber loss
        loss = F.smooth_l1_loss(state_action_values.view(batch_size), expected_state_action_values)
        # Optimize the model, .step() both the optimizer and the scheduler
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()
Example #10
class DQN(object):
    def __init__(self,
                 config,
                 env,
                 doubleDQN=False,
                 duelingDQN=False,
                 NoisyDQN=False,
                 N_stepDQN=False,
                 Prioritized=False):
        self.device = config.device

        self.doubleDQN = doubleDQN
        self.duelingDQN = duelingDQN
        self.NoisyDQN = NoisyDQN
        self.N_stepDQN = N_stepDQN
        self.Prioritized = Prioritized

        self.gamma = config.gamma  # discount factor
        self.learning_rate = config.learning_rate  # learning rate
        self.replace_target_iter = config.replace_target_iter  # target network update frequency
        self.replay_size = config.replay_size  # replay buffer size
        self.batch_size = config.batch_size  # batch size
        self.priority_alpha = config.priority_alpha
        self.priority_beta_start = config.priority_beta_start
        self.priority_beta_frames = config.priority_beta_frames

        self.epsilon = config.epsilon  # initial epsilon for epsilon-greedy exploration
        self.epsilon_final = config.epsilon_final  # minimum epsilon
        self.epsilon_decay = config.epsilon_decay  # epsilon decay rate

        self.num_states = env.observation_space.shape[0]  # state space dimension
        self.num_actions = env.action_space.n  # number of actions

        self.learn_start = self.batch_size * 3  # steps collected before learning starts

        self.learn_step_counter = 0  # total number of learning steps

        self.N_step = config.N_step  # number of steps for N-step learning

        self.N_step_buffer = []

        if self.Prioritized:
            self.memory = PrioritizedReplayMemory(
                self.replay_size, self.priority_alpha,
                self.priority_beta_start, self.priority_beta_frames)  # initialize the replay buffer
        else:
            self.memory = ReplayMemory(self.replay_size)  # initialize the replay buffer

        if self.duelingDQN:
            # initialize the evaluation (online) network
            self.eval_net = DuelingDQNNet(self.num_states,
                                          self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = DuelingDQNNet(self.num_states,
                                            self.num_actions).to(self.device)
        elif self.NoisyDQN:
            # initialize the evaluation (online) network
            self.eval_net = NoisyNet(self.num_states,
                                     self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = NoisyNet(self.num_states,
                                       self.num_actions).to(self.device)
        else:
            self.eval_net = DQNNet(self.num_states,
                                   self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = DQNNet(self.num_states,
                                     self.num_actions).to(self.device)

        # the target network starts with the same parameters as the evaluation network
        self.target_net.load_state_dict(self.eval_net.state_dict())

        # optimizer for training
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=self.learning_rate)

        # mean squared error loss
        self.loss_func = nn.MSELoss()

    # store a transition
    def store_transition(self, state, action, reward, next_state, done):
        if self.N_stepDQN:
            # append the current transition to the N-step buffer
            self.N_step_buffer.append(
                (state, action, reward, next_state, done))

            # return if fewer than N steps have been collected
            if len(self.N_step_buffer) < self.N_step:
                return

            # compute the N-step return
            R = sum([
                self.N_step_buffer[i][2] * (self.gamma**i)
                for i in range(self.N_step)
            ])
            state, action, _, _, _ = self.N_step_buffer.pop(0)

            self.memory.push((state, action, R, next_state, done))
        else:
            self.memory.push((state, action, reward, next_state, done))

    # choose an action
    def choose_action(self, s):
        with torch.no_grad():
            if np.random.random(
                    1) >= self.epsilon:  # with probability 1 - epsilon, take the greedy (max-Q) action
                X = torch.tensor([s], device=self.device, dtype=torch.float)
                a = self.eval_net(X).max(1)[1].view(1, 1)  # Q-values from the evaluation network
                return a.item()
            else:  # otherwise take a random action
                return np.random.randint(0, self.num_actions)

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, indices, weights = self.memory.sample(
            self.batch_size)  # batch of transitions

        # unzip the batch
        # e.g. if zipped is [(1, 4), (2, 5), (3, 6)], zip(*zipped) gives [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
            *transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float)
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.long).squeeze().view(
                                        -1, 1)  # reshape to a column tensor
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state,
                                        device=self.device,
                                        dtype=torch.float)
        batch_done = torch.tensor(batch_done,
                                  device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights

    # learning step
    def learn(self):
        # update the target network
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        # fetch a batch of samples
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights = self.get_batch(
        )

        # compute q(s, a; theta)
        if self.NoisyDQN:
            self.eval_net.sample_noise()
        q_s_a = self.eval_net(batch_state).gather(1, batch_action)

        # compute the target y_j = r_j + (1 - done) * gamma * max_a q(s', a; theta')
        with torch.no_grad():
            if self.NoisyDQN:
                self.target_net.sample_noise()
            if self.doubleDQN:
                next_max_action = self.eval_net(batch_next_state).max(
                    dim=1)[1].view(-1, 1)
                q_target = batch_reward + (
                    1. - batch_done) * self.gamma * self.target_net(
                        batch_next_state).gather(1, next_max_action)
            else:
                next_q = self.target_net(batch_next_state)
                max_next_q_a = next_q.max(1)[0].view(-1, 1)
                q_target = batch_reward + (
                    1. - batch_done) * self.gamma * max_next_q_a

        # loss and parameter update
        # (importance-sampling weights from the prioritized buffer are sampled but not applied to the loss here)
        if self.Prioritized:
            diff = (q_target - q_s_a)
            self.memory.update_priorities(
                indices,
                diff.detach().squeeze().abs().cpu().numpy().tolist())
        loss = self.loss_func(q_target, q_s_a)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # increment the learning step counter
        self.learn_step_counter += 1

    # save the model
    def save(self):
        if self.duelingDQN:
            torch.save(self.eval_net, 'duelingDQN.pkl')
        elif self.NoisyDQN:
            torch.save(self.eval_net, 'NoisyDQN.pkl')
        elif self.N_stepDQN:
            torch.save(self.eval_net, 'N_stepDQN.pkl')
        elif self.Prioritized:
            torch.save(self.eval_net, 'PriorityReplayDQN.pkl')
        else:
            torch.save(self.eval_net, 'DQN.pkl')

    # load the model
    def load(self):
        if self.duelingDQN:
            self.eval_net = torch.load('duelingDQN.pkl')
        elif self.NoisyDQN:
            self.eval_net = torch.load('NoisyDQN.pkl')
        elif self.N_stepDQN:
            self.eval_net = torch.load('N_stepDQN.pkl')
        elif self.Prioritized:
            self.eval_net = torch.load('PriorityReplayDQN.pkl')
        else:
            self.eval_net = torch.load('DQN.pkl')
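
The N-step branch of `store_transition` above computes the discounted sum R = sum_{i=0..N-1} gamma^i * r_i before pushing the oldest transition. A tiny standalone check of that formula (sketch, independent of the class):

gamma = 0.99
rewards = [1.0, 0.0, 2.0]                       # the last N = 3 rewards in the buffer
R = sum(r * gamma ** i for i, r in enumerate(rewards))
print(R)                                        # 1.0 + 0.99 * 0.0 + 0.99**2 * 2.0 ≈ 2.9602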
Example #11
class Agent:
    def __init__(self,
                 environment,
                 optimizer,
                 memory_length,
                 dueling=True,
                 loss='mse',
                 noisy_net=False,
                 egreedy=False,
                 save_memory=None,
                 save_weights=None,
                 verbose_action=False,
                 ):

        self.environment = environment
        self._optimizer = optimizer
        self._loss = loss
        self.dueling = dueling
        self.egreedy = egreedy
        self.noisy_net = noisy_net

        # Initialize discount and exploration rate, etc
        self.total_steps = 0
        self.gamma = 0.99
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00005
        self.tau = 0.05
        self.pretraining_steps = 0

        # Build networks
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model(how='hard')

        self.memory = ReplayMemory(memory_length)

        self.save_weights_fp = save_weights
        self.save_memory_fp = save_memory
        self.start_time = datetime.datetime.now()
        self.verbose_action = verbose_action

    def load_memory(self, fp):
        with open(fp, 'rb') as f:
            self.memory.load_memory(pickle.load(f))
            print(f'loading {self.memory.length} memories...')

    def save_memory(self, fp):
        if fp:
            with open(fp, 'wb') as f:
                print('saving replay memory...')
                pickle.dump(self.memory.get_memory(), f)

    def load_weights(self, weights_fp):
        if weights_fp:
            print('loading weights...')
            self.q_network.load_weights(weights_fp)
            self.align_target_model(how='hard')

    def save_weights(self, weights_fp):
        if weights_fp:
            self.q_network.save_weights(weights_fp)

    def set_epsilon_decay_schedule(self, epsilon, epsilon_min, annealed_steps):
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = math.log(self.epsilon / self.epsilon_min) / annealed_steps

    def set_beta_schedule(self, beta_start, beta_max, annealed_samplings):
        self.memory.beta = beta_start
        self.memory.beta_max = beta_max
        self.memory.beta_increment_per_sampling = (self.memory.beta_max - self.memory.beta) / annealed_samplings

    def predict(self, state, use_target=False):
        if use_target:
            return self.target_network.predict(state)
        else:
            return self.q_network.predict(state)

    def _decay_epsilon(self):
        self.epsilon = self.epsilon * np.exp(-self.epsilon_decay)

    def store(self, state, action, reward, next_state, terminated):
        self.memory.add((state, action, reward, next_state, terminated))
        self.total_steps += 1

        if not self.egreedy:
            if (self.epsilon > self.epsilon_min) and (self.memory.length > self.pretraining_steps):
                self._decay_epsilon()

    def batch_store(self, batch_load):
        batch_load[-2][2] = -0.1  # custom reward altering
        for row in batch_load:
            self.store(*row)

    def _build_compile_model(self):
        inputs = tf.keras.layers.Input(shape=(32, 290, 4))
        conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=4, padding='same', activation='relu')(inputs)
        conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=2, padding='same', activation='relu')(conv1)
        conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu')(conv2)
        conv3 = tf.keras.layers.Flatten()(conv3)

        if self.noisy_net:
            advt = NoisyNetDense(256, activation='relu')(conv3)
            final = NoisyNetDense(2)(advt)
        else:
            advt = tf.keras.layers.Dense(256, activation='relu')(conv3)
            final = tf.keras.layers.Dense(2)(advt)

        if self.dueling:
            if self.noisy_net:
                value = NoisyNetDense(256, activation='relu')(conv3)
                value = NoisyNetDense(1)(value)
            else:
                value = tf.keras.layers.Dense(256, activation='relu')(conv3)
                value = tf.keras.layers.Dense(1)(value)

            advt = tf.keras.layers.Lambda(lambda x: x - tf.reduce_mean(x, axis=1, keepdims=True))(final)
            final = tf.keras.layers.Add()([value, advt])

        model = tf.keras.models.Model(inputs=inputs, outputs=final)
        model.compile(optimizer=self._optimizer,
                      loss=self._loss,
                      metrics=['accuracy'])
        return model

    def align_target_model(self, how):
        assert how in ('hard', 'soft'), '"how" must be either "hard" or "soft"'

        if how == 'hard':
            self.target_network.set_weights(self.q_network.get_weights())

        elif how == 'soft':
            for t, e in zip(self.target_network.trainable_variables, self.q_network.trainable_variables):
                t.assign(t * (1 - self.tau) + (e * self.tau))

    def choose_action(self, state):
        if not self.egreedy:
            if np.random.rand() <= self.epsilon:
                action = self.environment.action_space.sample()
                if self.verbose_action:
                    print(f'action: {action}, q: random')
                return action

        q_values = self.predict(state, use_target=False)
        action = np.argmax(q_values[0])
        if self.verbose_action:
            print(f'action: {action}, q: {q_values}')
        return action

    def train(self, batch, is_weights):

        td_errors = np.zeros(len(batch))
        states = np.zeros((len(batch), 32, 290, 4))
        targets = np.zeros((len(batch), 2))

        for i, (state, action, reward, next_state, terminated) in enumerate(batch):
            target, td_error = self._get_target(state, action, reward, next_state, terminated)
            states[i] = state.reshape(32, 290, 4)
            targets[i] = target
            td_errors[i] = td_error

        self.q_network.fit(states, targets, sample_weight=is_weights, batch_size=32, epochs=1, verbose=0)
        self.align_target_model(how='soft')

        return td_errors

    def replay(self, batch_size, epoch_steps=None):

        num_batches = 1
        if epoch_steps:
            num_batches = int(np.max([np.floor(epoch_steps / 4), 1]))

        bar = progressbar.ProgressBar(maxval=num_batches,
                                      widgets=[f'training - ', progressbar.widgets.Counter(), f'/{num_batches} ',
                                               progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()

        for i in range(num_batches):
            leaf_idx, batch, is_weights = self.memory.get_batch(batch_size)  # prioritized experience replay
            td_errors = self.train(batch, is_weights)
            self.memory.update_sum_tree(leaf_idx, td_errors)

            bar.update(i + 1)

        bar.finish()
        self.save_weights(self.save_weights_fp)

    def _get_target(self, state, action, reward, next_state, terminated):
        target = self.predict(state, use_target=False)
        prev_target = target[0][action]

        if terminated:
            target[0][action] = reward
        else:
            a = np.argmax(self.predict(next_state, use_target=False)[0])
            target[0][action] = reward + (self.gamma * self.predict(next_state, use_target=True)[0][a])  # double Q Network

        td_error = abs(prev_target - target[0][action])

        return target, td_error
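
`set_epsilon_decay_schedule` above chooses the decay constant so that the multiplicative decay in `_decay_epsilon` reaches `epsilon_min` after exactly `annealed_steps` decays. A quick standalone check of that choice (sketch):

import math

epsilon, epsilon_min, annealed_steps = 1.0, 0.01, 100000
epsilon_decay = math.log(epsilon / epsilon_min) / annealed_steps
for _ in range(annealed_steps):
    epsilon = epsilon * math.exp(-epsilon_decay)
print(epsilon)   # ~0.01: the schedule lands on epsilon_min after annealed_steps decays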
Example #12
class DDPG:
    def __init__(self,
                 env,
                 actor_model,
                 critic_model,
                 memory=10000,
                 batch_size=64,
                 gamma=0.99,
                 tau=0.001,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 critic_decay=1e-2,
                 ou_theta=0.15,
                 ou_sigma=0.2,
                 render=None,
                 evaluate=None,
                 save_path=None,
                 save_every=10,
                 render_every=10,
                 train_per_step=True):
        self.env = env
        self.actor = actor_model
        self.actor_target = actor_model.clone()
        self.critic = critic_model
        self.critic_target = critic_model.clone()
        if use_cuda:
            for net in [
                    self.actor, self.actor_target, self.critic,
                    self.critic_target
            ]:
                net.cuda()
        self.memory = ReplayMemory(memory)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.random_process = OrnsteinUhlenbeckProcess(
            env.action_space.shape[0], theta=ou_theta, sigma=ou_sigma)
        self.optim_critic = optim.Adam(self.critic.parameters(),
                                       lr=critic_lr,
                                       weight_decay=critic_decay)
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.render = render
        self.render_every = render_every
        self.evaluate = evaluate
        self.save_path = save_path
        self.save_every = save_every
        self.train_per_step = train_per_step

    def update(self, target, source):
        zipped = zip(target.parameters(), source.parameters())
        for target_param, source_param in zipped:
            updated_param = target_param.data * (1 - self.tau) + \
                source_param.data * self.tau
            target_param.data.copy_(updated_param)

    def train_models(self):
        if len(self.memory) < self.batch_size:
            return None, None
        mini_batch = self.memory.sample_batch(self.batch_size)
        critic_loss = self.train_critic(mini_batch)
        actor_loss = self.train_actor(mini_batch)
        self.update(self.actor_target, self.actor)
        self.update(self.critic_target, self.critic)
        return critic_loss.data[0], actor_loss.data[0]

    def mse(self, inputs, targets):
        return torch.mean((inputs - targets)**2)

    def train_critic(self, batch):
        # forward pass
        # note: batch.done is used here as a continuation mask (assumed to be 0 for terminal transitions)
        pred_actions = self.actor_target(batch.next_states)
        target_q = batch.rewards + batch.done * self.critic_target(
            [batch.next_states, pred_actions]) * self.gamma
        pred_q = self.critic([batch.states, batch.actions])
        # backward pass
        loss = self.mse(pred_q, target_q)
        self.optim_critic.zero_grad()
        loss.backward(retain_graph=True)
        for param in self.critic.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim_critic.step()
        return loss

    def train_actor(self, batch):
        # forward pass
        pred_mu = self.actor(batch.states)
        pred_q = self.critic([batch.states, pred_mu])
        # backward pass
        loss = -pred_q.mean()
        self.optim_actor.zero_grad()
        loss.backward()
        #         for param in self.actor.parameters():
        #             param.grad.data.clamp_(-1, 1)
        self.optim_actor.step()
        return loss

    def prep_state(self, s):
        return Variable(torch.from_numpy(s).float().unsqueeze(0))

    def select_action(self, state, exploration=True):
        if use_cuda:
            state = state.cuda()
        self.actor.eval()
        action = self.actor(state)
        self.actor.train()
        if exploration:
            noise = Variable(
                torch.from_numpy(self.random_process.sample()).float())
            if use_cuda:
                noise = noise.cuda()
            action = action + noise
        return action

    def step(self, action):
        next_state, reward, done, _ = self.env.step(
            action.data.cpu().numpy()[0])
        next_state = self.prep_state(next_state)
        reward = FloatTensor([reward])
        return next_state, reward, done

    def warmup(self, num_steps):
        overall_step = 0
        while overall_step <= num_steps:
            done = False
            state = self.prep_state(self.env.reset())
            self.random_process.reset()
            while not done:
                overall_step += 1
                action = self.select_action(state)
                next_state, reward, done = self.step(action)
                self.memory.add(state, action, reward, next_state, done)
                state = next_state

    def train(self, num_steps):
        running_reward = None
        reward_sums = []
        losses = []
        overall_step = 0
        episode_number = 0

        while overall_step <= num_steps:
            episode_number += 1
            done = False
            state = self.prep_state(self.env.reset())
            reward_sum = 0
            self.random_process.reset()

            while not done:
                overall_step += 1
                action = self.select_action(state)
                next_state, reward, done = self.step(action)
                self.memory.add(state, action, reward, next_state, done)
                state = next_state
                reward_sum += reward[0]
                if self.train_per_step:
                    losses.append(self.train_models())
            if not self.train_per_step:
                losses.append(self.train_models())

            render_this_episode = self.render and (episode_number %
                                                   self.render_every == 0)
            evaluation_reward = self.run(render=render_this_episode)
            reward_sums.append((reward_sum, evaluation_reward))

            if self.save_path is not None and (episode_number % self.save_every
                                               == 0):
                self.save_models(self.save_path)
                self.save_results(self.save_path, losses, reward_sums)

            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print(
                'episode: {}  steps: {}  running train reward: {:.4f}  eval reward: {:.4f}'
                .format(episode_number, overall_step, running_reward,
                        evaluation_reward))

        if self.save_path is not None:
            self.save_models(self.save_path)
            self.save_results(self.save_path, losses, reward_sums)
        return reward_sums, losses

    def run(self, render=True):
        state = self.env.reset()
        done = False
        reward_sum = 0
        while not done:
            if render:
                self.env.render()
            action = self.select_action(self.prep_state(state),
                                        exploration=False)
            state, reward, done, _ = self.env.step(
                action.data.cpu().numpy()[0])
            reward_sum += reward
        return reward_sum

    def save_models(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_results(self, path, losses, rewards):
        losses = np.array([l for l in losses if l[0] is not None])
        rewards = np.array(rewards)
        np.savetxt(os.path.join(path, 'losses.csv'),
                   losses,
                   delimiter=',',
                   header='critic,actor',
                   comments='')
        np.savetxt(os.path.join(path, 'rewards.csv'),
                   rewards,
                   delimiter=',',
                   header='train,evaluation',
                   comments='')
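
# The trainer above calls self.prep_state(...) and uses FloatTensor, neither of
# which appears in this fragment. A minimal sketch of what such helpers usually
# look like, written as module-level functions for brevity; the exact
# implementation (dtype handling, batch dimension of 1) is an assumption.
import numpy as np
import torch

# device-aware tensor alias matching the .cpu() transfers used above (assumption)
FloatTensor = (torch.cuda.FloatTensor
               if torch.cuda.is_available() else torch.FloatTensor)


def prep_state(state):
    """Convert a raw observation into a 1 x state_dim float tensor (sketch)."""
    return FloatTensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
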
Example #14
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed=399,
                 memory_size=int(1e6),
                 batch_size=128,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-4,
                 weight_decay=0.0,
                 actor_units=(256, 128),
                 critic_units=(256, 128),
                 action_range=None):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            memory_size (int): maximum number of experiences kept in the replay buffer
            batch_size (int): subset size for each training step
            gamma (float): discount factor
            tau (float): interpolation parameter
            lr_actor (float): learning rate for actor model
            lr_critic (float): learning rate for critic model
            weight_decay (float): L2 weight decay
            actor_units (tuple): numbers of nodes in the 1st and 2nd hidden layers of the actor network
            critic_units (tuple): numbers of nodes in the 1st and 2nd hidden layers of the critic network
            action_range (tuple or None): (min, max) bounds of the action range
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)  # seeds Python's RNG; random.seed() itself returns None
        self.random_seed = random_seed
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.weight_decay = weight_decay
        self.actor_units = actor_units
        self.critic_units = critic_units

        # action range
        if isinstance(action_range, tuple) or action_range is None:
            self.action_range = action_range
        else:
            raise ValueError(
                "action_range needs to be a tuple with two elements or None.")

        # Actor Network (w/ Target Network)
        if Agent.actor_local is None:
            Agent.actor_local = Actor(self.state_size,
                                      self.action_size,
                                      self.random_seed,
                                      fc1_units=self.actor_units[0],
                                      fc2_units=self.actor_units[1]).to(device)
        if Agent.actor_target is None:
            Agent.actor_target = Actor(
                self.state_size,
                self.action_size,
                self.random_seed,
                fc1_units=self.actor_units[0],
                fc2_units=self.actor_units[1]).to(device)
        if Agent.actor_optimizer is None:
            Agent.actor_optimizer = optim.Adam(Agent.actor_local.parameters(),
                                               lr=self.lr_actor)

        self.actor_local = Agent.actor_local
        self.actor_target = Agent.actor_target
        self.actor_optimizer = Agent.actor_optimizer

        # Critic Network (w/ Target Network)
        if Agent.critic_local is None:
            Agent.critic_local = Critic(
                self.state_size,
                self.action_size,
                self.random_seed,
                fc1_units=self.critic_units[0],
                fc2_units=self.critic_units[1]).to(device)
        if Agent.critic_target is None:
            Agent.critic_target = Critic(
                self.state_size,
                self.action_size,
                self.random_seed,
                fc1_units=self.critic_units[0],
                fc2_units=self.critic_units[1]).to(device)
        if Agent.critic_optimizer is None:
            Agent.critic_optimizer = optim.Adam(
                Agent.critic_local.parameters(),
                lr=self.lr_critic,
                weight_decay=self.weight_decay)

        self.critic_local = Agent.critic_local
        self.critic_target = Agent.critic_target
        self.critic_optimizer = Agent.critic_optimizer

        # Noise process
        self.noise = OUNoise(self.action_size, self.random_seed)

        # Define memory
        if Agent.memory is None:
            Agent.memory = ReplayMemory(self.memory_size, self.batch_size,
                                        self.random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self._time_step = 0
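
# The constructor above creates OUNoise(self.action_size, self.random_seed) for
# exploration, but the noise class itself is not shown. A minimal
# Ornstein-Uhlenbeck noise sketch with that constructor shape; the theta/sigma
# defaults and the sample() method name are assumptions.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: x <- x + theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-run mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise vector."""
        x = self.state
        dx = self.theta * (self.mu - x) + \
            self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state
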
Example #15
File: train.py  Project: tevfikoguz/cule
def worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size, rank=args.rank)
    else:
        args.rank = 0

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = not torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if not args.no_cuda_train else torch.device('cpu')

    # Setup
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or not args.no_cuda_train:
        torch.cuda.manual_seed(random.randint(1, 10000))

    if train_device.type == 'cuda':
        print('Train:\n' + cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        test_env = create_vectorize_atari_env(args.env_name, args.seed, args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray',
                            device='cpu', rescale=True, clip_rewards=False,
                            episodic_life=False, repeat_prob=0.0, frameskip=4)

    # Agent
    dqn = Agent(args, test_env.action_space)

    # Construct validation memory
    if args.rank == 0:
        print(dqn)
        print('Initializing evaluation memory with {} entries...'.format(args.evaluation_size), end='', flush=True)
        start_time = time.time()

    val_mem = initialize_validation(args, train_device)

    if args.rank == 0:
        print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

    if args.evaluate:
        if args.rank == 0:
            eval_start_time = time.time()
            dqn.eval()  # Set DQN (online network) to evaluation mode
            rewards, lengths, avg_Q = test(args, 0, dqn, val_mem, test_env, train_device)
            dqn.train()  # Set DQN (online network) back to training mode
            eval_total_time = time.time() - eval_start_time

            rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
            lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

            print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'Avg. Q: {:4.4f} | {}'
                  .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                          lstd, avg_Q, format_time(eval_total_time)),
                  flush=True)
    else:
        if args.rank == 0:
            print('Entering main training loop', flush=True)

            if args.output_filename:
                csv_file = open(args.output_filename, 'w', newline='')
                csv_file.write(json.dumps(vars(args)))
                csv_file.write('\n')
                csv_writer = csv.writer(csv_file, delimiter=',')
                csv_writer.writerow(['frames', 'total_time',
                                     'rmean', 'rmedian', 'rstd', 'rmin', 'rmax',
                                     'lmean', 'lmedian', 'lstd', 'lmin', 'lmax'])
            else:
                csv_writer, csv_file = None, None

            if args.plot:
                from tensorboardX import SummaryWriter
                current_time = datetime.now().strftime('%b%d_%H-%M-%S')
                log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
                writer = SummaryWriter(log_dir=log_dir)
                for k, v in vars(args).items():
                    writer.add_text(k, str(v))

            # Environment
            print('Initializing environments...', end='', flush=True)
            start_time = time.time()

        if args.use_openai:
            train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                                   episode_life=True, clip_rewards=args.reward_clip,
                                                   max_frames=args.max_episode_length)
            observation = torch.from_numpy(train_env.reset()).squeeze(1)
        else:
            train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray',
                                 device=env_device, rescale=True,
                                 clip_rewards=args.reward_clip,
                                 episodic_life=True, repeat_prob=0.0)
            train_env.train()
            observation = train_env.reset(initial_steps=args.ale_start_steps, verbose=args.verbose).clone().squeeze(-1)

        if args.rank == 0:
            print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        has_completed = torch.zeros(args.num_ales, device=train_device, dtype=torch.bool)

        mem = ReplayMemory(args, args.memory_capacity, train_device)
        mem.reset(observation)
        priority_weight_increase = (1 - args.priority_weight) / (args.t_max - args.learn_start)

        state = torch.zeros((args.num_ales, args.history_length, 84, 84), device=mem.device, dtype=torch.float32)
        state[:, -1] = observation.to(device=mem.device, dtype=torch.float32).div(255.0)

        num_frames_per_iter = args.num_ales
        total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))
        epsilons = np.linspace(args.epsilon_start, args.epsilon_final, math.ceil(args.epsilon_frames / num_frames_per_iter))
        epsilon_offset = math.ceil(args.learn_start / num_frames_per_iter)

        prefetcher = data_prefetcher(args.batch_size, train_device, mem)

        avg_loss = 'N/A'
        eval_offset = 0
        target_update_offset = 0

        total_time = 0

        # main loop
        iterator = range(total_steps)
        if args.rank == 0:
            iterator = tqdm(iterator)

        env_stream = torch.cuda.Stream()
        train_stream = torch.cuda.Stream()

        for update in iterator:

            T = args.world_size * update * num_frames_per_iter
            epsilon = epsilons[min(update - epsilon_offset, len(epsilons) - 1)] if T >= args.learn_start else epsilons[0]
            start_time = time.time()

            if update % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            dqn.eval()
            nvtx.range_push('train:select action')
            if args.noisy_linear:
                action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            else:
                action = dqn.act_e_greedy(state, epsilon=epsilon)
            nvtx.range_pop()
            dqn.train()

            if args.use_openai:
                action = action.cpu().numpy()

            torch.cuda.synchronize()

            with torch.cuda.stream(env_stream):
                nvtx.range_push('train:env step')
                observation, reward, done, info = train_env.step(action)  # Step

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation).squeeze(1)
                    reward = torch.from_numpy(reward.astype(np.float32))
                    done = torch.from_numpy(done.astype(bool))
                    action = torch.from_numpy(action)
                else:
                    observation = observation.clone().squeeze(-1)
                nvtx.range_pop()

                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device)
                done = done.to(device=train_device, dtype=torch.bool)
                action = action.to(device=train_device)

                observation = observation.float().div_(255.0)
                not_done = 1.0 - done.float()

                state[:, :-1].copy_(state[:, 1:].clone())
                state *= not_done.view(-1, 1, 1, 1)
                state[:, -1].copy_(observation)

                # update episodic reward counters
                has_completed |= done

                episode_rewards += reward.float()
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            # Train and test
            if T >= args.learn_start:
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)  # Anneal importance sampling weight β to 1
                prefetcher.preload()

                avg_loss = 0.0
                num_minibatches = min(int(args.num_ales / args.replay_frequency), 8)
                for _ in range(num_minibatches):
                    # Sample transitions
                    nvtx.range_push('train:sample states')
                    idxs, states, actions, returns, next_states, nonterminals, weights = prefetcher.next()
                    nvtx.range_pop()

                    nvtx.range_push('train:network update')
                    loss = dqn.learn(states, actions, returns, next_states, nonterminals, weights)
                    nvtx.range_pop()

                    nvtx.range_push('train:update priorities')
                    mem.update_priorities(idxs, loss)  # Update priorities of sampled transitions
                    nvtx.range_pop()

                    avg_loss += loss.mean().item()
                avg_loss /= num_minibatches

                # Update target network
                if T >= target_update_offset:
                    dqn.update_target_net()
                    target_update_offset += args.target_update

            torch.cuda.current_stream().wait_stream(env_stream)
            torch.cuda.current_stream().wait_stream(train_stream)

            nvtx.range_push('train:append memory')
            mem.append(observation, action, reward, done)  # Append transition to memory
            nvtx.range_pop()

            total_time += time.time() - start_time

            if args.rank == 0:
                if args.plot and ((update % args.replay_frequency) == 0):
                    writer.add_scalar('train/epsilon', epsilon, T)
                    writer.add_scalar('train/rewards', final_rewards.mean(), T)
                    writer.add_scalar('train/lengths', final_lengths.mean(), T)

                if T >= eval_offset:
                    eval_start_time = time.time()
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    rewards, lengths, avg_Q = test(args, T, dqn, val_mem, test_env, train_device)
                    dqn.train()  # Set DQN (online network) back to training mode
                    eval_total_time = time.time() - eval_start_time
                    eval_offset += args.evaluation_interval

                    rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
                    lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

                    print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'Avg. Q: {:4.4f} | {}'
                          .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                                  lstd, avg_Q, format_time(eval_total_time)),
                          flush=True)

                    if args.output_filename and csv_writer and csv_file:
                        csv_writer.writerow([T, total_time,
                                             rmean, rmedian, rstd, rmin, rmax,
                                             lmean, lmedian, lstd, lmin, lmax])
                        csv_file.flush()

                    if args.plot:
                        writer.add_scalar('eval/rewards', rmean, T)
                        writer.add_scalar('eval/lengths', lmean, T)
                        writer.add_scalar('eval/avg_Q', avg_Q, T)

                loss_str = '{:4.4f}'.format(avg_loss) if isinstance(avg_loss, float) else avg_loss
                progress_data = 'T = {:,} epsilon = {:4.2f} avg reward = {:4.2f} loss: {}' \
                                .format(T, epsilon, final_rewards.mean().item(), loss_str)
                iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
        test_env.close()
Example #16
    returns_np = np.asarray(returns)
    fsv = np.asarray(first_state_values)
    lsv = np.asarray(last_state_values)
    plt.title('Training...Returns')
    plt.xlabel('Frames')
    plt.ylabel('Return')
    plt.plot(ep_durations, returns)
    plt.plot(ep_durations, fsv, 'C1')
    plt.plot(ep_durations, lsv, 'C2')
    plt.pause(0.002)


#init agent, memory and environment
agent = Agent(N_ACTIONS, EPS_START, EPS_END, EPS_STEPS, GAMMA, TRAIN, use_cuda,
              BATCH_SIZE, 'CP')
memory = ReplayMemory(RM_CAPACITY)

env = gym.make(ENV)

ep_durations = [0]  # used for plotting
returns = [0]
last_state_values = [0]
first_state_values = [0]

for i_episode in range(INIT_RM):
    if not TRAIN:
        break
    cur_state = env.reset()
    while True:
        action = agent.take_action(FloatTensor([cur_state]))
        next_state, reward, done, _ = env.step(env.action_space.sample())
Example #17
class MADDPG:
    def __init__(self, n_agents, dim_obs, dim_act, batch_size,
                 capacity, episodes_before_train, load_models=None):
        # self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
        # self.critics = [Critic(n_agents, dim_obs, dim_act) for i in range(n_agents)]

        if load_models is None:
            self.models = Models(n_agents, dim_obs, dim_act)
            self.actors_target = deepcopy(self.models.actors)
            self.critics_target = deepcopy(self.models.critics)
            self.critic_optimizer = [Adam(x.parameters(), lr=0.0001) for x in self.models.critics]  # 0.001
            self.actor_optimizer = [Adam(x.parameters(), lr=0.00001) for x in self.models.actors]   # 0.0001
            self.memory = ReplayMemory(capacity)
            self.var = [1.0 for i in range(n_agents)]
        else:
            print('Start loading models!')
            states = th.load(load_models)
            self.models = states['models']
            self.critic_optimizer = states['critic_optimizer']
            self.actor_optimizer = states['actor_optimizer']
            self.critics_target = states['critics_target']
            self.actors_target = states['actors_target']
            self.memory = states['memory']
            self.var = states['var']
            print('Models loaded!')

        self.n_agents = n_agents
        self.n_states = dim_obs
        self.n_actions = dim_act
        self.batch_size = batch_size
        self.use_cuda = th.cuda.is_available()
        self.episodes_before_train = episodes_before_train

        self.GAMMA = 0.95
        self.tau = 0.01

        if self.use_cuda:
            for x in self.models.actors:
                x.cuda()
            for x in self.models.critics:
                x.cuda()
            for x in self.actors_target:
                x.cuda()
            for x in self.critics_target:
                x.cuda()

        self.steps_done = 0
        self.episode_done = 0

    def update_policy(self):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        c_loss = []
        a_loss = []

        critics_grad = []
        actors_grad = []
        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))
            non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                 batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = Variable(th.stack(batch.states).type(FloatTensor))
            action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
            reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = Variable(th.stack(
                [s for s in batch.next_states if s is not None]).type(FloatTensor))

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)

            # critic network
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.models.critics[agent](whole_state, whole_action)   # forward?

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:, i, :]) for i in range(self.n_agents)]
            non_final_next_actions = th.stack(non_final_next_actions)
#            non_final_next_actions = Variable(non_final_next_actions)
            non_final_next_actions = (
                non_final_next_actions.transpose(0, 1).contiguous())

            target_Q = Variable(th.zeros(self.batch_size).type(FloatTensor))
            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1, self.n_agents * self.n_actions))

            # scale_reward: to scale reward in Q functions
            target_Q = (target_Q * self.GAMMA) + (reward_batch[:, agent] * scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()

            self.critic_optimizer[agent].step()

            # actor network
            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.models.actors[agent](state_i)   # forward
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.models.critics[agent](whole_state, whole_action)     # forward
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()

            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

            # for test
            '''
            s = 0
            for x in self.models.critics[agent].parameters():
                s += 1
                print('s: ', s)
                print(type(x))
                print('x.grad.shape: ', x.grad.size())
                print('x.data.shape: ', x.data.size())
            '''
            critics_agent_grad = []
            actors_agent_grad = []
            for x in self.models.critics[agent].parameters():
                critics_agent_grad.append(x.grad.data.norm(2))
                # critics_agent_grad.append(th.mean(x.grad).data[0])
            for x in self.models.actors[agent].parameters():
                actors_agent_grad.append(x.grad.data.norm(2))
                # actors_agent_grad.append(th.mean(x.grad).data[0])

            critics_grad.append(critics_agent_grad)
            actors_grad.append(actors_agent_grad)


        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.models.critics[i], self.tau)
                soft_update(self.actors_target[i], self.models.actors[i], self.tau)

        '''
        # gradient clipping
        if self.clip is not None:
            nn.utils.clip_grad_norm(self.model.parameters(), self.clip)
        '''

        # return c_loss, a_loss  #, critics_grad, actors_grad
        return critics_grad, actors_grad

    def select_action(self, state_batch):
        # state_batch: n_agents x state_dim
        actions = Variable(th.zeros(
            self.n_agents,
            self.n_actions))
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
        for i in range(self.n_agents):
            sb = state_batch[i, :].detach()
            act = self.models.actors[i](sb.unsqueeze(0)).squeeze()

            act += Variable(th.from_numpy(np.random.randn(2) * self.var[i]).type(FloatTensor))

            if self.episode_done > self.episodes_before_train and self.var[i] > 0.05:   # and self.episode_done % 100 == 0
                self.var[i] *= 0.999998

            act = th.clamp(act, -1.0, 1.0)

            actions[i, :] = act
        self.steps_done += 1
        # print('steps_done: ', self.steps_done)
        # print('episode_done: ', self.episode_done)

        return actions
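
# The update above unpacks batches with Experience(*zip(*transitions)) and
# scales rewards by a module-level scale_reward, neither of which is part of
# this fragment. A minimal sketch of both; the field names are inferred from
# the attribute accesses above, and the scale value is only a placeholder.
from collections import namedtuple

# field names inferred from batch.states / batch.actions / batch.next_states / batch.rewards
Experience = namedtuple('Experience',
                        ('states', 'actions', 'next_states', 'rewards'))

# reward scaling applied inside the critic target above; 0.01 is an assumed value
scale_reward = 0.01
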
Example #18
class Agent:
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 input_dims,
                 batch_size,
                 n_actions,
                 combined=False,
                 max_mem_size=100000,
                 eps_end=0.05,
                 eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.batch_size = batch_size
        self.memory = ReplayMemory(input_dims, max_mem_size, batch_size,
                                   combined)
        self.iter_cntr = 0
        self.replace_target = 100

        self.Q_eval = DeepQNetwork(lr,
                                   n_actions=n_actions,
                                   input_dims=input_dims,
                                   fc1_dims=256,
                                   fc2_dims=256)
        self.Q_next = DeepQNetwork(lr,
                                   n_actions=n_actions,
                                   input_dims=input_dims,
                                   fc1_dims=256,
                                   fc2_dims=256)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation]).to(self.Q_eval.device)
            actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if not self.memory.is_sufficient():
            return

        self.Q_eval.optimizer.zero_grad()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        states, actions, rewards, new_states, dones = \
            self.memory.sample_memory()
        states = T.tensor(states).to(self.Q_eval.device)
        new_states = T.tensor(new_states).to(self.Q_eval.device)
        rewards = T.tensor(rewards).to(self.Q_eval.device)
        dones = T.tensor(dones).to(self.Q_eval.device)
        q_eval = self.Q_eval.forward(states)[batch_index, actions]
        q_next = self.Q_next.forward(new_states)  # bootstrap from the target network (synced every replace_target steps)
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.iter_cntr += 1
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

        if self.iter_cntr % self.replace_target == 0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())
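
# The agent above expects a replay buffer exposing is_sufficient() and
# sample_memory(), plus a `combined` flag in its constructor. A minimal sketch
# of such a buffer, assuming combined experience replay simply forces the
# newest transition into every sampled batch; the store_transition name and
# all internals are assumptions, not the original class.
import numpy as np


class ReplayMemorySketch:
    def __init__(self, input_dims, max_size, batch_size, combined=False):
        self.mem_size = max_size
        self.batch_size = batch_size
        self.combined = combined
        self.mem_cntr = 0
        self.state_memory = np.zeros((max_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((max_size, *input_dims),
                                         dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int64)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        idx = self.mem_cntr % self.mem_size
        self.state_memory[idx] = state
        self.new_state_memory[idx] = state_
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.terminal_memory[idx] = done
        self.mem_cntr += 1

    def is_sufficient(self):
        return self.mem_cntr >= self.batch_size

    def sample_memory(self):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        if self.combined:
            # combined experience replay: always include the latest transition
            batch[0] = (self.mem_cntr - 1) % self.mem_size
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])
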
Example #19
def main():
    args = get_args()
    args.critic_layers = literal_eval(args.critic_layers)
    args.actor_layers = literal_eval(args.actor_layers)

    # create save directory
    save_dir = os.path.join('weights', args.exp_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    else:
        shutil.move(save_dir, save_dir + '.backup')
        os.makedirs(save_dir)

    state_transform = NormState(args.prosthetic)
    # state_transform = StateVelCentr(obstacles_mode='standard',
    #                                 exclude_centr=True,
    #                                 vel_states=[])
    env = RunEnv2(state_transform,
                  integrator_accuracy=args.accuracy,
                  model=args.modeldim,
                  prosthetic=args.prosthetic,
                  difficulty=args.difficulty,
                  skip_frame=1)
    env.change_model(args.modeldim, args.prosthetic, args.difficulty)
    num_actions = env.get_action_space_size()
    del env

    print('building model')
    # build model
    model_params = {
        'state_size': state_transform.state_size,
        'num_act': num_actions,
        'gamma': args.gamma,
        'actor_layers': args.actor_layers,
        'critic_layers': args.critic_layers,
        'actor_lr': args.actor_lr,
        'critic_lr': args.critic_lr,
        'layer_norm': args.layer_norm
    }
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)

    if args.weights is not None:
        actor.load(args.weights)

    actor_lr_step = (args.actor_lr - args.actor_lr_end) / args.max_steps
    critic_lr_step = (args.critic_lr - args.critic_lr_end) / args.max_steps

    # build actor
    weights = [p.get_value() for p in params_actor]

    # build replay memory
    memory = ReplayMemory(state_transform.state_size, num_actions, 5000000)

    # init shared variables
    global_step = Value('i', 0)
    updates = Value('i', 0)
    best_reward = Value('f', -1e8)
    testing = Value('i', 0)

    # init agents
    data_queue = Queue()
    workers = []
    weights_queues = []
    if not args.test:
        num_agents = args.n_threads - 2
        print('starting {} agents'.format(num_agents))
    else:
        num_agents = 1
        print('starting testing agent')
    for i in range(num_agents):
        w_queue = Queue()
        worker = Process(target=run_agent,
                         args=(args, model_params, weights, state_transform,
                               data_queue, w_queue, i, global_step, updates,
                               best_reward, args.param_noise_prob, save_dir,
                               args.max_steps))
        worker.daemon = True
        worker.start()
        sleep(args.sleep)
        workers.append(worker)
        weights_queues.append(w_queue)

    if not args.test:
        print('starting training')
    else:
        print('starting testing')
    prev_steps = 0
    start_save = time()
    start_test = time()
    weights_rew_to_check = []
    while global_step.value < args.max_steps:

        # get all data
        try:
            i, batch, weights_check, reward = data_queue.get_nowait()
            if weights_check is not None:
                weights_rew_to_check.append((weights_check, reward))
            weights_queues[i].put(weights)
            # add data to memory
            memory.add_samples(*batch)
        except queue.Empty:
            pass

        # training step
        # TODO: consider not training during testing model
        if not args.test:
            if len(memory) > args.start_train_steps:
                batch = memory.random_batch(args.batch_size)

                if np.random.rand() < args.flip_prob:
                    states, actions, rewards, terminals, next_states = batch

                    states_flip = state_transform.flip_states(states)
                    next_states_flip = state_transform.flip_states(next_states)
                    actions_flip = np.zeros_like(actions)
                    half = num_actions // 2
                    actions_flip[:, :half] = actions[:, half:]
                    actions_flip[:, half:] = actions[:, :half]

                    states_all = np.concatenate((states, states_flip))
                    actions_all = np.concatenate((actions, actions_flip))
                    rewards_all = np.tile(rewards.ravel(), 2).reshape(-1, 1)
                    terminals_all = np.tile(terminals.ravel(),
                                            2).reshape(-1, 1)
                    next_states_all = np.concatenate(
                        (next_states, next_states_flip))
                    batch = (states_all, actions_all, rewards_all,
                             terminals_all, next_states_all)

                actor_loss, critic_loss = train_fn(*batch)
                updates.value += 1
                if np.isnan(actor_loss):
                    raise ValueError('actor loss is nan')
                if np.isnan(critic_loss):
                    raise ValueError('critic loss is nan')
                target_update_fn()
                weights = actor.get_actor_weights()

        delta_steps = global_step.value - prev_steps
        prev_steps += delta_steps

        actor_lr.set_value(
            lasagne.utils.floatX(
                max(actor_lr.get_value() - delta_steps * actor_lr_step,
                    args.actor_lr_end)))
        critic_lr.set_value(
            lasagne.utils.floatX(
                max(critic_lr.get_value() - delta_steps * critic_lr_step,
                    args.critic_lr_end)))

        # check if need to save and test
        if (time() - start_save) / 60. > args.save_period_min:
            fname = os.path.join(
                save_dir, 'weights_updates_{}.pkl'.format(updates.value))
            actor.save(fname)
            start_save = time()

        # start new test process
        weights_rew_to_check = [(w, r) for w, r in weights_rew_to_check
                                if r > best_reward.value and r > 0]
        weights_rew_to_check = sorted(weights_rew_to_check, key=lambda x: x[1])
        if ((time() - start_test) / 60. > args.test_period_min
                or len(weights_rew_to_check) > 0) and testing.value == 0:
            testing.value = 1
            print('start test')
            if len(weights_rew_to_check) > 0:
                _weights, _ = weights_rew_to_check.pop()
            else:
                _weights = weights
            worker = Process(target=test_agent,
                             args=(args, testing, state_transform,
                                   args.num_test_episodes, model_params,
                                   _weights, best_reward, updates, global_step,
                                   save_dir))
            worker.daemon = True
            worker.start()
            start_test = time()

    print('training finished')
    # end all processes
    for w in workers:
        w.join()
Example #20
class Agent:
    def __init__(self,
                 env,
                 exploration_rate=1,
                 exploration_decay=0.9999,
                 explore=True):
        self.action_space = env.action_space.n
        self.memory = ReplayMemory(MEMORY_SIZE)
        self.memory.fill_memory(env)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.dqn = DQN(4, self.action_space).float().to(self.device)
        self.env = env
        self.episode_rewards = []
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.explore = explore
        self.model_optim = optim.Adam(self.dqn.parameters(), lr=1e-4)
        self.episodes = 0

    def get_action(self, obs):
        if self.exploration_rate > random.random() and self.explore:
            action = random.randint(0, self.action_space - 1)
        else:
            obs = torch.tensor(obs, device=self.device).reshape(1, 4, 80,
                                                                80).float()
            action = self.dqn(obs).argmax().tolist()
        return action

    def train(self, num_episodes):
        num_steps = 0
        running_loss = 0
        loss = nn.MSELoss()

        episode_rewards = []
        for episode in tqdm(range(num_episodes)):
            obs = rgb2gray(self.env.reset()).reshape(1, 80, 80)
            for i in range(3):
                obs = np.append(obs, rgb2gray(self.env.step(0)[0]), 0)

            terminal = False
            episode_reward = 0
            while not terminal:
                action = self.get_action(obs)
                result = self.env.step(action)

                terminal = result[2]
                new_obs = np.append(obs[1:], rgb2gray(result[0]), 0)
                reward = result[1]
                if reward > 0:
                    print(episode, reward)
                episode_reward += reward

                self.memory.push(obs, action, new_obs, reward, terminal)
                batch = self.memory.sample(BATCH_SIZE)
                observations, y = self.process_batch(batch)
                num_steps += 1

                outputs = self.dqn(observations)
                episode_loss = loss(outputs, y)
                self.model_optim.zero_grad()
                episode_loss.backward()
                self.model_optim.step()
                running_loss += episode_loss.item()

                if num_steps % 1000 == 0:  # log progress every 1000 steps
                    print(num_steps)

            episode_rewards.append(episode_reward)
            if self.exploration_rate > 0.1:
                self.exploration_rate *= self.exploration_decay
        self.episodes += num_episodes
        self.save(str(self.episodes) + '_model')
        self.episode_rewards += episode_rewards
        np.save(str(self.episodes) + '_rewards', self.episode_rewards)
        return episode_rewards

    def process_batch(self, batch):
        observations = [batch[i][0] for i in range(len(batch))]
        observations = torch.tensor(np.array(observations)).reshape(
            (BATCH_SIZE, 4, 80, 80)).float().to(self.device)

        next_observations = [batch[i][2] for i in range(len(batch))]
        next_observations = torch.tensor(np.array(next_observations)).reshape(
            (BATCH_SIZE, 4, 80, 80)).float().to(self.device)

        maxs = self.dqn(next_observations)
        maxs = maxs.max(1).values.float().to(self.device)

        rewards = [batch[i][3] for i in range(len(batch))]
        rewards = torch.tensor(rewards).float().to(self.device)

        terminals = [~batch[i][4] for i in range(len(batch))]
        terminals = torch.tensor(terminals).float().to(self.device)

        maxs = maxs * terminals  # zero the bootstrap term for terminal transitions

        y = self.dqn(observations)
        Qs = rewards + GAMMA * maxs

        for i in range(len(batch)):
            y[i, batch[i][1]] = Qs[i]

        return observations, y

    def load_dqn(self, path):
        self.dqn = torch.load(path)

    def save(self, path):
        torch.save(self.dqn, path)
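
# Frames above are preprocessed with an rgb2gray helper that is not shown.
# A minimal sketch that matches the (1, 80, 80) shapes stacked above; the
# luminance weights and the use of skimage.transform.resize are assumptions.
import numpy as np
from skimage.transform import resize


def rgb2gray(frame):
    """Convert an RGB frame to one grayscale channel of shape (1, 80, 80) (sketch)."""
    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114])
    gray = resize(gray, (80, 80), anti_aliasing=True)
    return gray.reshape(1, 80, 80).astype(np.float32)
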
Example #21
############################ Memory ######################
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))
##########################################################

################## Declare networks ######################
screen_height, screen_width = 84, 96

policy_net = DQN(screen_height, screen_width, nb_actions).to(device)
# policy_net.load_state_dict(torch.load('./models/model'))
target_net = DQN(screen_height, screen_width, nb_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(size_memory)

##########################################################


################### Select Action #########################
def select_action(state):
    global steps_done
    sample = random.random()  # Between 0 and 1
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:  #Action determined by the NN
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
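
# The example above is cut off inside select_action. A minimal, self-contained
# sketch of how such an epsilon-greedy selection typically concludes, written
# as a standalone function; the epsilon constants and the greedy/random split
# are assumptions based on the comments above, not the original code.
import math
import random

import torch


def select_action_sketch(state, policy_net, nb_actions, steps_done, device,
                         eps_start=0.9, eps_end=0.05, eps_decay=200):
    """Epsilon-greedy action selection mirroring the truncated example (sketch)."""
    eps_threshold = eps_end + (eps_start - eps_end) * \
        math.exp(-1. * steps_done / eps_decay)
    if random.random() > eps_threshold:
        with torch.no_grad():
            # max(1) returns (values, indices); the indices are the greedy actions
            return policy_net(state).max(1)[1].view(1, 1)
    # otherwise explore with a uniformly random action index
    return torch.tensor([[random.randrange(nb_actions)]],
                        device=device, dtype=torch.long)
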
Example #22
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dqn_online = DQN(N_ACTIONS, STATE_SHAPE)
dqn_target = DQN(N_ACTIONS, STATE_SHAPE)
dqn_online.to(device)
dqn_target.to(device)
# optimizer = torch.optim.RMSprop(dqn_online.parameters(), lr=LR, momentum=0.95, eps=0.01) # paper used rmsprop
optimizer = torch.optim.Adam(dqn_online.parameters(), lr=LR)
if CKPT_ENABLED and os.path.exists(CKPT_FILENAME):
    progress = load_checkpoint(dqn_online, dqn_target, optimizer,
                               CKPT_FILENAME)
else:
    progress = []

dqn_target.eval()
mem_buffer = ReplayMemory(MEMORY_SIZE, STATE_SHAPE)

loss_fn = torch.nn.SmoothL1Loss()  # huber loss function
agent = DQNAgent(device, mem_buffer, dqn_online, dqn_target, optimizer,
                 loss_fn, GAMMA, BATCH_SIZE, UPDATE_ONLINE_INTERVAL,
                 UPDATE_TARGET_INTERVAL)

# training phase

# adjust these hyperparameters as necessary
num_episodes = 5000  # number of episodes to train for
explore_phase_length = 50000  # number of steps without any exploitation (paper used 50k)
epsilon = 1.0  # initial epsilon value (paper used 1.0)
epsilon_decrement_steps = 1000000  # how many steps to decrement epsilon to min value (paper used 1 million)
intermediate_epsilon = 0.1  # can be used to decay epsilon in two phases as recommended by openai (set equal to min_epsilon to disable)
min_epsilon = 0.01  # smallest possible value of epsilon (paper used 0.1 for dqn, 0.01 for ddqn)
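
# The comments above describe an exploration-only phase followed by an
# (optionally two-phase) epsilon decay. A minimal schedule sketch built from
# the constants defined above; the length of the second decay phase is an
# assumption.
def epsilon_at(step):
    """Linear two-phase epsilon schedule using the hyperparameters above (sketch)."""
    if step < explore_phase_length:
        # pure exploration: epsilon stays at its initial value
        return epsilon
    t = step - explore_phase_length
    if t < epsilon_decrement_steps:
        # phase 1: decay linearly from the initial epsilon to intermediate_epsilon
        frac = t / epsilon_decrement_steps
        return epsilon + frac * (intermediate_epsilon - epsilon)
    # phase 2: decay linearly from intermediate_epsilon to min_epsilon over
    # another epsilon_decrement_steps steps (assumed second-phase length)
    frac = min((t - epsilon_decrement_steps) / epsilon_decrement_steps, 1.0)
    return intermediate_epsilon + frac * (min_epsilon - intermediate_epsilon)
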
Example #23
    HEIGHT = 84
    WIDTH = 84
    TEST_EPISODES = 10
    MODEL_PATH = 'dqn_model_scale.pt'

    # create environment
    # See wrappers.py
    env = create_atari_env("Breakout-v0",
                           episode_life=False,
                           frame_stack=True,
                           scale=True,
                           clip_rewards=False)
    epsilon = EPS_START
    steps_done = 0
    # initialize replay memory
    memory = ReplayMemory(MEMORY_SIZE)

    # create networks
    action_num = env.action_space.n
    policy_net = DQN(HEIGHT, WIDTH, action_num).to(device)
    target_net = DQN(HEIGHT, WIDTH, action_num).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    print(policy_net)

    # setup optimizer
    # optimizer = optim.RMSprop(policy_net.parameters())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # train model
    train(env, NUM_EPISODES)
Example #24
def main(args):
    ''' compares 3 different agents

    Args:
        param1 (args): command line arguments
    '''
    env = UnityEnvironment(file_name="Banana_Linux/Banana.x86_64",
                           no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset()[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    agent = Duelling_DDQNAgent(args,
                               state_size=state_size,
                               action_size=action_size)
    mem = ReplayMemory(args, args.evaluation_size)
    scores = dqn(agent,
                 env,
                 brain_name,
                 mem,
                 args,
                 n_episodes=args.n_episodes,
                 eps_decay=args.eps_decay)
    save_and_plot(scores, args, 1)

    mem = ReplayMemory(args, args.evaluation_size)
    agent = Double_DQNAgent(args,
                            state_size=state_size,
                            action_size=action_size)
    scores2 = dqn(agent,
                  env,
                  brain_name,
                  mem,
                  args,
                  n_episodes=args.n_episodes,
                  eps_decay=args.eps_decay)
    save_and_plot(scores2, args, 2)

    args.priority_exponent = 0.8
    args.multi_step = 7
    args.update_every = 4
    args.noise = True
    mem = ReplayMemory(args, args.evaluation_size)
    agent = Agent(args, state_size=state_size, action_size=action_size)
    scores3 = dqn(agent,
                  env,
                  brain_name,
                  mem,
                  args,
                  n_episodes=args.n_episodes,
                  eps_decay=args.eps_decay)
    save_and_plot(scores3, args, 3)

    # plot the scores
    fig = plt.figure()
    fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores, label="Duelling Double DQN")
    plt.plot(np.arange(len(scores2)), scores2, label="Double DQN")
    plt.plot(np.arange(len(scores3)), scores3, label="Rainbow")
    plt.legend()
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
Example #25
class StatusHandler(tornado.websocket.WebSocketHandler):

    agent = Agent(args)
    mem = ReplayMemory(args, args.memory_capacity, agent_count)
    agent_initialized = False
    cycle_counter = 1
    rgb_image_count = 1
    depth_image_count = 0
    depth_image_dim = 0
    ir_count = 1
    ground_count = 0
    compass_count = 1
    target_count = 1
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)

    if args.mode_distribute:
        thread_event = threading.Event()

    state_cnn = torch.zeros(4, agent_count, 3, 128, 128)
    state_oth = torch.zeros(4, agent_count, 11)
    T = 0

    def open(self):
        print("open")

    def on_close(self):
        print("close")

    def on_message(self, message):
        print("received message")
        self.received_message(message)

    def callback(self, count):
        self.write_message('{"inventoryCount":"%d"}' % count)

    def send_action(self, action):
        dat = msgpack.packb({"command": "".join(map(str, action))})
        self.write_message(dat, binary=True)

    def received_message(self, m):
        payload = m
        dat = msgpack.unpackb(payload, encoding='utf-8')
        image = []
        depth = []
        agent_count = len(dat['image'])
        for i in range(agent_count):
            image.append(Image.open(io.BytesIO(bytearray(dat['image'][i]))))
            if (self.depth_image_count == 1):
                depth_dim = len(dat['depth'][0])
                temp = (Image.open(io.BytesIO(bytearray(dat['depth'][i]))))
                depth.append(
                    np.array(ImageOps.grayscale(temp)).reshape(
                        self.depth_image_dim))

        if (self.ir_count == 1):
            ir = dat['ir']
            ir_dim = len(ir[0])
        else:
            ir = []
            ir_dim = 0

        if (self.ground_count == 1):
            ground = dat['ground']
            ground_dim = len(ground[0])
        else:
            ground = []
            ground_dim = 0

        if (self.compass_count == 1):
            compass = dat['compass']
            compass_dim = len(compass[0])
        else:
            compass = []
            compass_dim = 0

        if (self.target_count == 1):
            target = dat['target']
            target_dim = len(target[0])
        else:
            target = []
            target_dim = 0
        self.agent.agent_count = agent_count
        observation = {
            "image": image,
            "depth": depth,
            "ir": ir,
            "ground": ground,
            "compass": compass,
            "target": target
        }
        reward = np.array(dat['reward'], dtype=np.float32)
        reward = torch.tensor(reward)
        end_episode = np.array(dat['endEpisode'], dtype=bool)
        print("get daze!")

        s_cnn = self.agent._observation_to_state_cnn(observation)
        self.state_cnn = torch.stack(
            (self.state_cnn[1], self.state_cnn[2], self.state_cnn[3], s_cnn))
        s_cnn_ = torch.cat([self.state_cnn[n] for n in range(4)], dim=1)

        s_oth = self.agent._observation_to_state_other(observation)
        self.state_oth = torch.stack(
            (self.state_oth[1], self.state_oth[2], self.state_oth[3], s_oth))
        s_oth_ = torch.cat([self.state_oth[n] for n in range(4)], dim=1)

        state = {'cnn': s_cnn_, 'oth': s_oth_}
        action = self.agent.act(state)
        action_ = action.numpy()
        self.send_action(action_)
        print(action)

        # for i in range(1000):
        self.mem.append({
            'cnn': s_cnn,
            'oth': s_oth
        }, action, reward, end_episode)
        if self.T > 1000:
            self.agent.learn(self.mem, self.T)
        self.T += 1
        if self.T % args.replay_frequency == 0:
            # self.agent.reset_noise()  # Draw a new set of noisy weights
            pass
        # Update target network
        if self.T % args.target_update == 0:
            self.agent.update_target_net()
Example #26
class MADDPG:
    def __init__(self, n_agents, dim_obs, dim_act, batch_size, capacity,
                 episodes_before_train):

        self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
        self.critics = [
            Critic(n_agents, dim_obs, dim_act) for i in range(n_agents)
        ]
        ifload = False
        if ifload:
            for i in range(2):
                name1 = "parameter/actor_v3" + str(i) + ".pth"
                name2 = "parameter/critic_v3" + str(i) + ".pth"
                #print(name1)
                self.actors[i].load_state_dict(
                    th.load(name1, map_location=th.device('cpu')))
                self.critics[i].load_state_dict(
                    th.load(name2, map_location=th.device('cpu')))
        self.actors_target = deepcopy(self.actors)
        self.critics_target = deepcopy(self.critics)
        ## Constrain........
        self.constrain = Constrain(dim_obs, 2)
        self.n_agents = n_agents
        self.n_states = dim_obs
        self.n_actions = dim_act
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.use_cuda = th.cuda.is_available()
        self.episodes_before_train = episodes_before_train

        self.GAMMA = 0.95
        self.tau = 0.01

        self.var = [1.0 for i in range(n_agents)]
        self.critic_optimizer = [
            Adam(x.parameters(), lr=0.0008) for x in self.critics
        ]
        self.actor_optimizer = [
            Adam(x.parameters(), lr=0.0002) for x in self.actors
        ]

        self.constrain_optimizer = Adam(self.constrain.parameters(), lr=0.0006)

        if self.use_cuda:
            for x in self.actors:
                x.cuda()
            for x in self.critics:
                x.cuda()
            for x in self.actors_target:
                x.cuda()
            for x in self.critics_target:
                x.cuda()
            self.constrain.cuda()
        self.steps_done = 0
        self.episode_done = 0

    def update_policy(self):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        c_loss = []
        a_loss = []
        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))
            non_final_mask = ByteTensor(
                list(map(lambda s: s is not None, batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = th.stack(batch.states).type(FloatTensor)
            action_batch = th.stack(batch.actions).type(FloatTensor)
            reward_batch = th.stack(batch.rewards).type(FloatTensor)
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = th.stack([
                s for s in batch.next_states if s is not None
            ]).type(FloatTensor)

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)
            self.critic_optimizer[agent].zero_grad()
            #print("whole_action",whole_action)
            current_Q = self.critics[agent](whole_state, whole_action)

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:, i, :])
                for i in range(self.n_agents)
            ]
            non_final_next_actions = th.stack(non_final_next_actions)
            non_final_next_actions = (non_final_next_actions.transpose(
                0, 1).contiguous())

            target_Q = th.zeros(self.batch_size).type(FloatTensor)

            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1, self.n_agents *
                                            self.n_actions)).squeeze()
            # scale_reward: to scale reward in Q functions

            target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + (
                reward_batch[:, agent].unsqueeze(1) * scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        if self.steps_done % 30 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)
                if self.steps_done % 300 == 0:
                    th.save(self.critics[i].state_dict(),
                            "parameter/critic_v3" + str(i) + ".pth")
                    th.save(self.actors[i].state_dict(),
                            "parameter/actor_v3" + str(i) + ".pth")
        return c_loss, a_loss

    def update_rule(self):
        if self.episode_done <= self.episodes_before_train:
            return None
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
        transitions = self.memory.sample(self.batch_size)
        batch = Experience(*zip(*transitions))

        state_batch = th.stack(batch.states).type(FloatTensor)
        action_batch = th.stack(batch.actions).type(FloatTensor)

        whole_state = state_batch.view(self.batch_size, -1)
        whole_action = action_batch.view(self.batch_size, -1)

        true_act, rules = self.select_rule_action(state_batch)

        # alternate which agent's rule output is optimized on this step
        id = 0 if self.steps_done % 2 == 0 else 1

        Q = []

        for ag in range(self.n_agents):
            Q.append(self.critics[ag](whole_state,
                                      Variable(th.Tensor(true_act))))
        Qsum = sum(Q)
        if self.steps_done % 600 == 0:
            print("true_act..", true_act[15])
            print("rule..", rules[id][15])
            print("Qsum..", Qsum[15])
        # push the selected rule probabilities toward actions with high joint Q
        loss_r = -rules[id] * Qsum
        loss_r = loss_r.mean()
        loss_r.backward()
        self.constrain_optimizer.step()
        return loss_r

    def rule_act(self, state_batch):
        rules = []
        rules.append(self.constrain(state_batch[:, 0, :]))
        rules.append(self.constrain(state_batch[:, 1, :]))
        rule = rules[1].detach().numpy()

        # sample a discrete action per batch entry from the rule network's output
        action = [np.random.choice(2, 1, p=softmax(x)) for x in rule]

        # both agents share the sampled action
        true_act = [[x[0], x[0]] for x in action]
        return true_act, rules

    def select_rule_action(self, state_batch):
        true_act = []
        rules = []
        for id in range(2):
            obs = state_batch[:, id, :]
            act = self.actors[id](obs)
            act = th.clamp(act, 0.0, 1.0)  # clamp so the output can act as a probability
            act = act.detach().numpy()
            act_prob = [[1 - x[0], x[0]] for x in act]
            self.constrain_optimizer.zero_grad()
            rule = self.constrain(obs)
            rules.append(rule)
            rule0 = rule.detach().numpy()

            # combine the rule and actor probabilities, renormalize, then sample
            scale_act = [
                softmax(np.array(rule0[i]) * np.array(act_prob[i]))
                for i in range(self.batch_size)
            ]
            action = [np.random.choice(2, 1, p=x) for x in scale_act]
            true_act.append(action)
        true_act = np.array(true_act).reshape(self.batch_size, 2)
        return true_act, rules

    def select_rule_action2(self, state_batch):
        true_act = []
        rules = []

        obs = state_batch[:, 0, :]
        rule = self.constrain(obs)
        rules.append(rule)
        rules.append(self.constrain(state_batch[:, 1, :]))
        rule = rule.detach().numpy()
        action = [np.random.choice(2, 1, p=x) for x in rule]
        # both agents follow the action sampled from agent 0's rule
        true_act.append(action)
        true_act.append(action)
        true_act = np.array(true_act).reshape(self.batch_size, 2)
        return true_act, rules

    def getLaw(self, rule_prob, action_prob):
        # mask out actions whose probability falls below the rule's probability
        # of the opposite action (a simple "forbidden action" filter)
        forbidden_prob = [rule_prob[1], rule_prob[0]]
        for k in range(len(action_prob)):
            if action_prob[k] < forbidden_prob[k]:
                action_prob[k] = 0
        return action_prob

    def select_action(self, state_batch, rule_prob):
        # state_batch: n_agents x state_dim
        actions = th.zeros(self.n_agents, self.n_actions)
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
        for i in range(self.n_agents):
            sb = state_batch[i, :].detach()
            act = self.actors[i](sb.unsqueeze(0))

            act += th.from_numpy(
                np.random.randn(self.n_actions) *
                self.var[i]).type(FloatTensor)

            if self.episode_done > self.episodes_before_train and\
               self.var[i] > 0.05:
                self.var[i] *= 0.999998

            act = th.clamp(act, 0.0, 1.0)
            actProb = [1 - act[0][0], act[0][0]]
            action_prob = self.getLaw(rule_prob, actProb)

            # pick the allowed action with the highest remaining probability
            at = np.argmax(np.array(action_prob))
            act = Variable(th.Tensor([[at]]))
            actions[i, :] = act
        self.steps_done += 1

        return actions

    def select_action2(self, state_batch):
        # state_batch: n_agents x state_dim
        actions = th.zeros(self.n_agents, self.n_actions)
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
        for i in range(self.n_agents):
            sb = state_batch[i, :].detach()
            act = self.actors[i](sb.unsqueeze(0))

            act += th.from_numpy(
                np.random.randn(self.n_actions) *
                self.var[i]).type(FloatTensor)

            if self.episode_done > self.episodes_before_train and\
               self.var[i] > 0.05:
                self.var[i] *= 0.999998
            act = th.clamp(act, 0.0, 1.0)

            actions[i, :] = act

        return actions

    def select_eval_action(self, state_batch, rule_prob, rule):
        # state_batch: n_agents x state_dim
        actions = th.zeros(self.n_agents, self.n_actions)
        for i in range(self.n_agents):
            # NOTE: evaluation feeds a fixed zero observation to the actor
            # rather than the entries of state_batch
            sta = Variable(th.Tensor([[0]]))
            act = self.actors[i](sta)
            act = th.clamp(act, 0.0, 1.0)
            if rule:
                actProb = [1 - act[0][0], act[0][0]]
                action_prob = self.getLaw(rule_prob, actProb)
                at = np.argmax(np.array(action_prob))
                act = Variable(th.Tensor([[at]]))
            actions[i, :] = act
        self.steps_done += 1
        return actions
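
A note on a helper used above: update_policy calls a free-standing soft_update(target, source, tau) that is not defined in this snippet. A minimal sketch consistent with how it is called (Polyak averaging of the target parameters with coefficient tau) could be:

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)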
Example #27
# Agent
dqn = Agent(args, env)

# If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
if args.model is not None and not args.evaluate:
    if not args.memory:
        raise ValueError('Cannot resume training without memory save path. Aborting...')
    elif not os.path.exists(args.memory):
        raise ValueError(
            'Could not find memory file at {path}. Aborting...'.format(path=args.memory))

    mem = load_memory(args.memory, args.disable_bzip_memory)

else:
    mem = ReplayMemory(args, args.memory_capacity, env)

# linearly anneal the prioritized-replay importance-sampling weight to 1 over training
priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# # Construct validation memory
# val_mem = ReplayMemory(args, args.evaluation_size, test_env)
# T, done = 0, True
# while T < args.evaluation_size:
#     if done:
#         state, done = env.reset(), False

#     next_state, _, done, _ = env.step(np.random.randint(0, n_actions))
#     val_mem.append(state, -1, 0.0, done)
#     state = next_state
#     T += 1
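
The commented-out block above outlines how a validation memory is typically filled with random-policy transitions before evaluation. A runnable version of that sketch follows; the use of env.action_space.n in place of the undefined n_actions, and of env instead of test_env, are assumptions:

# Construct a validation memory by acting randomly (sketch, see note above)
val_mem = ReplayMemory(args, args.evaluation_size, env)
T, done = 0, True
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False
    next_state, _, done, _ = env.step(np.random.randint(0, env.action_space.n))
    val_mem.append(state, -1, 0.0, done)
    state = next_state
    T += 1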
Example #28
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size
        
        # These are hyperparameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 1000000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        
        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)

        # Initialize a target network and initialize the target network to the policy net
        ### CODE ###
        self.target_net = DQN(action_size).to(device)
        self.update_target_net()
        self.target_net.eval()

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)           

    # after some time interval update the target net to be same with policy net
    def update_target_net(self):
        ### CODE ###
        self.target_net.load_state_dict(self.policy_net.state_dict())


    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE #### (copy over from agent.py!)
            a = torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long)

        else:
            ### CODE #### (copy over from agent.py!)
            with torch.no_grad():
                state = torch.from_numpy(state).reshape(1,4,84,84).to(device)
                a = self.policy_net(state).max(1)[1].view(1, 1)
        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3]  # whether each transition ended the episode
        # mask of non-terminal transitions (1 where the episode continues)
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.uint8)

        # Your agent.py code here with double DQN modifications
        ### CODE ###
        curr_Q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # The active line uses the standard DQN target (max over the target net);
        # the commented line below is the double DQN variant, where the policy
        # net selects the action and the target net evaluates it (see the
        # standalone sketch at the end of this example).
        next_state_values = torch.zeros(states.size(0), device=device)
        next_states = torch.from_numpy(next_states).to(device)
        next_state_values[mask == 1] = self.target_net(next_states[mask == 1]).max(1)[0].detach()
        #next_state_values[mask == 1] = self.target_net(next_states[mask == 1]).detach().gather(1, self.policy_net(next_states[mask == 1]).argmax(1).unsqueeze(1)).squeeze(1)

        target_Q = next_state_values * self.discount_factor + rewards
        loss = F.smooth_l1_loss(curr_Q, target_Q)

        self.optimizer.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10)
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()
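
The assignment comment above asks for a double DQN modification; the active code computes the standard DQN target, while the commented-out line holds the double DQN variant. A standalone sketch of that target computation, assuming policy_net, target_net, float next_states on the same device as rewards, a non-terminal mask, and a discount gamma shaped as in the example:

import torch

def double_dqn_target(policy_net, target_net, next_states, mask, rewards, gamma):
    # Double DQN: the online (policy) net selects the greedy action and the
    # target net evaluates it, decoupling selection from evaluation.
    non_terminal = (mask == 1).to(next_states.device)
    next_state_values = torch.zeros(rewards.size(0), device=next_states.device)
    with torch.no_grad():
        best_actions = policy_net(next_states[non_terminal]).argmax(dim=1, keepdim=True)
        next_state_values[non_terminal] = target_net(
            next_states[non_terminal]).gather(1, best_actions).squeeze(1)
    return rewards + gamma * next_state_values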

        
Example #29
class Agent(object):
    """
    The learner and decision maker.
    Based on the DQN algorithm - ref. Mnih et al. 2015
    i.e. Q-Learning with experience replay & a target network

    All calls to tensorflow are wrapped into methods.

    Support for environments is currently manually configured.
    """
    def __init__(self,
                 env,
                 discount,
                 tau,
                 sess,
                 total_steps,
                 batch_size,
                 layers,
                 learning_rate,
                 epsilon_decay_fraction=0.5,
                 memory_fraction=0.25,
                 process_observation=False,
                 process_target=False,
                 **kwargs):

        self.env = env
        self.discount = discount
        self.tau = tau
        self.sess = sess
        self.batch_size = batch_size

        #  number of steps where epsilon is decayed from 1.0 to 0.1
        decay_steps = total_steps * epsilon_decay_fraction
        self.epsilon_getter = EpsilonDecayer(decay_steps)

        #  the counter is stepped up every time we act or learn
        self.counter = 0

        if (repr(env) == '<TimeLimit<CartPoleEnv<CartPole-v0>>>') or \
            (repr(env) == '<TimeLimit<CartPoleEnv<CartPole-v1>>>'):

            obs_space_shape = env.observation_space.shape
            #  a gym Discrete space stores the number of actions, not the
            #  shape of a single action array, so specify the action space
            #  shape as a one-element tuple
            self.action_space_shape = (1, )
            #  a list of all possible actions
            self.actions = [act for act in range(env.action_space.n)]

        elif repr(env) == '<TimeLimit<PendulumEnv<Pendulum-v0>>>':
            #  NOTE: Pendulum support is unfinished; the code below this raise
            #  is currently unreachable
            raise ValueError('Build in progress')
            obs_space_shape = env.observation_space.shape
            self.action_space_shape = env.action_space.shape
            self.actions = np.linspace(env.action_space.low,
                                       env.action_space.high,
                                       num=20,
                                       endpoint=True).tolist()

        elif repr(env) == '<TimeLimit<MountainCarEnv<MountainCar-v0>>>':
            obs_space_shape = env.observation_space.shape
            self.action_space_shape = (1, )
            self.actions = [act for act in range(env.action_space.n)]
        else:
            raise ValueError('Environment not supported')

        self.memory = ReplayMemory(obs_space_shape,
                                   self.action_space_shape,
                                   size=int(total_steps * memory_fraction))

        model_config = {
            'input_shape': obs_space_shape,
            'output_shape': (len(self.actions), ),
            'layers': layers,
            'learning_rate': learning_rate
        }

        #  the two approximations of Q(s,a)
        #  use the same config dictionary for both
        self.online = Qfunc(model_config, scope='online')
        self.target = Qfunc(model_config, scope='target')

        #  set up the operations to copy the online network parameters to
        #  the target network
        self.update_ops = self.make_target_net_update_ops()

        if process_observation:
            self.observation_processor = Normalizer(obs_space_shape[0])

        if process_target:
            self.target_processor = Normalizer(1)

        self.acting_writer = tf.summary.FileWriter('./results/acting',
                                                   graph=self.sess.graph)

        self.learning_writer = tf.summary.FileWriter('./results/learning',
                                                     graph=self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.update_target_network(tau=1.0)

    def __repr__(self):
        return '<class DQN Agent>'

    def make_target_net_update_ops(self):
        """
        Creates the Tensorflow operations to update the target network.

        The two lists of Tensorflow Variables (one for the online net, one
        for the target net) are iterated over together and new weights
        are assigned to the target network
        """
        with tf.variable_scope('update_target_network'):

            self.tf_tau = tf.placeholder(tf.float32, shape=(), name='tau')

            update_ops = []
            for online, target in zip(self.online.params, self.target.params):

                o_name, t_name = online.name.split('/')[1:], target.name.split(
                    '/')[1:]
                print('copying {} to {}'.format(o_name, t_name))

                assert o_name == t_name
                val = tf.add(tf.multiply(online, self.tf_tau),
                             tf.multiply(target, 1 - self.tf_tau))

                operation = target.assign(val)
                update_ops.append(operation)

        return update_ops

    def update_target_network(self, tau=None):
        """
        Updates the target network weights using the parameter tau

        Relies on the sorted lists of tf.Variables kept in each Qfunc object
        """
        if tau is None:
            tau = self.tau
        logging.debug('updating target net at count {}'.format(self.counter))

        self.sess.run(self.update_ops, {self.tf_tau: tau})

    def remember(self, observation, action, reward, next_observation, done):
        """
        Store experience in the agent's memory.

        args
            observation (np.array)
            action (np.array)
            reward (np.array)
            next_observation (np.array)
            done (np.array)
        """
        if hasattr(self, 'observation_processor'):
            observation = self.observation_processor(observation)
            next_observation = self.observation_processor(next_observation)

        return self.memory.remember(observation, action, reward,
                                    next_observation, done)

    def predict_target(self, observations):
        """
        Target network is used to predict the maximum discounted expected
        return for the next_observation as experienced by the agent

        args
            observations (np.array)

        returns
            max_q (np.array) shape=(batch_size, 1)
        """
        fetches = [
            self.target.q_values, self.target.max_q, self.target.acting_summary
        ]

        feed_dict = {self.target.observation: observations}

        q_vals, max_q, summary = self.sess.run(fetches, feed_dict)
        self.learning_writer.add_summary(summary, self.counter)

        logging.debug('predict_target - next_obs {}'.format(observations))
        logging.debug('predict_target - q_vals {}'.format(q_vals))
        logging.debug('predict_target - max_q {}'.format(max_q))

        return max_q.reshape(observations.shape[0], 1)

    def predict_online(self, observation):
        """
        We use our online network to choose actions.

        args
            observation (np.array) a single observation

        returns
            action
        """
        obs = observation.reshape((1, *self.env.observation_space.shape))

        fetches = [
            self.online.q_values, self.online.max_q,
            self.online.optimal_action_idx, self.online.acting_summary
        ]

        feed_dict = {self.online.observation: obs}
        q_values, max_q, action_idx, summary = self.sess.run(
            fetches, feed_dict)
        self.acting_writer.add_summary(summary, self.counter)

        max_q = max_q.flatten()[0]
        max_q_sum = tf.Summary(
            value=[tf.Summary.Value(tag='max_q_acting', simple_value=max_q)])

        self.acting_writer.add_summary(max_q_sum, self.counter)
        self.acting_writer.flush()

        #  index at zero because TF returns an array
        action = self.actions[action_idx[0]]

        logging.debug('predict_online - observation {}'.format(obs))
        logging.debug('predict_online - pred_q_values {}'.format(q_values))
        logging.debug('predict_online - max_q {}'.format(max_q))
        logging.debug('predict_online - action_index {}'.format(action_idx))
        logging.debug('predict_online - action {}'.format(action))

        return action

    def act(self, observation):
        """
        Our agent attempts to manipulate the world.

        Acting according to epsilon greedy policy.

        args
            observation (np.array)

        returns
            action (np.array)
        """
        self.counter += 1
        epsilon = self.epsilon_getter.epsilon
        logging.debug('epsilon is {}'.format(epsilon))

        if epsilon > random_uniform():
            action = self.env.action_space.sample()
            logging.debug('acting randomly - action is {}'.format(action))
        else:
            action = self.predict_online(observation)
            logging.debug('acting optimally action is {}'.format(action))

        epsilon_sum = tf.Summary(
            value=[tf.Summary.Value(tag='epsilon', simple_value=epsilon)])
        self.acting_writer.add_summary(epsilon_sum, self.counter)
        self.acting_writer.flush()

        # return np.array(action).reshape(1, *self.action_space_shape)
        return action

    def learn(self):
        """
        Our agent attempts to make sense of the world.

        A batch sampled using experience replay is used to train the online
        network using targets from the target network.

        returns
            train_info (dict)
        """
        batch = self.memory.get_batch(self.batch_size)
        observations = batch['observations']
        actions = batch['actions']
        rewards = batch['rewards']
        terminals = batch['terminal']
        next_observations = batch['next_observations']

        next_obs_q = self.predict_target(next_observations)

        #  if next state is terminal, set the value to zero
        next_obs_q[terminals] = 0

        #  creating a target for Q(s,a) using the Bellman equation
        rewards = rewards.reshape(rewards.shape[0], 1)
        target = rewards + self.discount * next_obs_q

        if hasattr(self, 'target_processor'):
            target = self.target_processor(target)

        #  map each stored action to its index in self.actions, then pair it
        #  with the row index so the online net can select Q(s, a)
        indices = np.zeros((actions.shape[0], 1), dtype=int)

        for arr, action in zip(indices, actions):
            idx = self.actions.index(action)
            arr[0] = idx

        rng = np.arange(actions.shape[0]).reshape(actions.shape[0], 1)
        indices = np.concatenate([rng, indices], axis=1)

        fetches = [
            self.online.q_values, self.online.q_value, self.online.loss,
            self.online.train_op, self.online.learning_summary
        ]

        feed_dict = {
            self.online.observation: observations,
            self.online.action: indices,
            self.online.target: target
        }

        q_vals, q_val, loss, train_op, train_sum = self.sess.run(
            fetches, feed_dict)

        logging.debug('learning - observations {}'.format(observations))

        logging.debug('learning - rewards {}'.format(rewards))
        logging.debug('learning - terminals {}'.format(terminals))
        logging.debug('learning - next_obs_q {}'.format(next_obs_q))

        logging.debug('learning - actions {}'.format(actions))
        logging.debug('learning - indices {}'.format(indices))
        logging.debug('learning - q_values {}'.format(q_vals))
        logging.debug('learning - q_value {}'.format(q_val))

        logging.debug('learning - target {}'.format(target))
        logging.debug('learning - loss {}'.format(loss))

        self.learning_writer.add_summary(train_sum, self.counter)

        self.update_target_network()

        return {'loss': loss}
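
A minimal TF1-style interaction loop for this agent is sketched below; the hyperparameter values, the layers tuple format, and the warm-up threshold before calling learn() are illustrative assumptions rather than values from the original project:

import gym
import tensorflow as tf

env = gym.make('CartPole-v0')
total_steps = 10000  # illustrative

with tf.Session() as sess:
    agent = Agent(env=env,
                  discount=0.99,  # assumed hyperparameters
                  tau=0.01,
                  sess=sess,
                  total_steps=total_steps,
                  batch_size=64,
                  layers=(64, 64),
                  learning_rate=1e-3)

    observation, done = env.reset(), False
    for step in range(total_steps):
        action = agent.act(observation)
        next_observation, reward, done, _ = env.step(action)
        agent.remember(observation, action, reward, next_observation, done)
        if step > agent.batch_size:  # simple warm-up before learning
            agent.learn()
        observation = env.reset() if done else next_observation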
Example #30
class Agent():
    def __init__(self, action_size):
        self.load_model = True

        self.action_size = action_size

        # These are hyperparameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 100000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        self.target_net = DQN(action_size)
        self.target_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)

        # initialize target net
        self.update_target_net()

        if self.load_model:
            self.policy_net = torch.load('./save_model/ec1_breakout_dqn')

    # after some time interval update the target net to be same with policy net
    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE ####
            # Choose a random action from the full action set
            a = torch.tensor([[random.randrange(self.action_size)]])
            if torch.cuda.is_available():
                a = a.cuda()
        else:
            ### CODE ####
            state = torch.tensor(state).unsqueeze(0)
            if torch.cuda.is_available():
                state = state.cuda()
            a = self.policy_net(state).max(1)[1]
            a = a.view(1, 1)
        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        actions = list(mini_batch[1])
        rewards = list(mini_batch[2])
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3] # checks if the game is over
        
        # Compute Q(s_t, a) - Q of the current state
        ### CODE ####
        states = torch.tensor(states, device=device)
        actions = torch.tensor(actions, device=device, dtype=torch.long).view(-1, 1)
        next_states = torch.tensor(next_states, device=device)
        rewards = torch.tensor(rewards, device=device)

        a = self.policy_net(states)
        Q = a.gather(1, actions).view(-1)

        # Compute Q function of next state
        ### CODE ####
        Q_next = self.target_net(next_states)

        # Find maximum Q-value of action at next state from target net
        ### CODE ####
        Q_next = Q_next.max(1)[0].detach()

        # Compute the Huber Loss
        ### CODE ####
        Huber_loss = F.smooth_l1_loss(Q, (Q_next * self.discount_factor + rewards))
        
        # Optimize the model 
        ### CODE ####
        self.optimizer.zero_grad()
        Huber_loss.backward()
        for parameter in self.policy_net.parameters():
            parameter.grad.data.clamp_(-1, 1)
        self.optimizer.step()
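
The hyperparameters train_start and update_target are defined in __init__ but not used inside this snippet; the helper below sketches how a driver loop would typically use them (a hypothetical illustration, not code from the original source):

def maybe_train_and_sync(agent, frame):
    # gate learning on the replay warm-up and periodically sync the target net
    if frame >= agent.train_start:
        agent.train_policy_net(frame)
    if frame % agent.update_target == 0:
        agent.update_target_net()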