Example #1
class Agent:
    def __init__(self, env, env_w, device, config: Config):
        self.env = env
        self.env_w = env_w
        self.device = device
        self.cfg = config
        self.n_actions = config.n_actions
        self.policy_net = config.policy_net
        self.target_net = config.target_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []

    def select_action(self, state):
        self.steps_done += 1
        sample = random.random()
        eps_threshold = self.cfg.EPS_END + (self.cfg.EPS_START - self.cfg.EPS_END) * \
            math.exp(-1. * self.steps_done / self.cfg.EPS_DECAY)
        # Exploit with probability (1 - eps_threshold), otherwise explore.
        if sample > eps_threshold:
            with torch.no_grad():
                # policy_net(state).max(1) returns the largest Q-value in each row
                # together with its index, so max(1)[1] is the greedy action with
                # the larger expected reward.
                action = self.policy_net(state).max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return torch.tensor([[action]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.cfg.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.cfg.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.cfg.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.cfg.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def step(self, i_episode):
        # Initialize the environment and state
        self.env.reset()
        last_screen = self.env_w.get_screen()
        current_screen = self.env_w.get_screen()
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = self.select_action(state)
            obs, reward, done, _ = self.env.step(action.item())
            # Shaped reward: penalize |obs[2]| (the pole angle in CartPole)
            # instead of the raw environment reward (kept below for reference).
            # reward = torch.tensor([reward], device=self.device)
            reward = torch.tensor([-abs(obs[2])], device=self.device, dtype=torch.float)

            # Observe new state
            last_screen = current_screen
            current_screen = self.env_w.get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            self.memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            self.optimize_model()
            if done:
                self.episode_durations.append(t + 1)
                self.env_w.plot_durations(self.episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % self.cfg.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
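
This example and the next rely on a Transition namedtuple and a ReplayMemory class that are not shown. A minimal sketch of those two pieces, following the standard PyTorch DQN tutorial pattern (an assumption; the actual implementations used with these snippets may differ):

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        # A bounded deque drops the oldest transitions automatically.
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
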
Example #2
class DQN:
    def __init__(self, env, hparams):
        self.hparams = hparams
        self.env = env
        self.n = env.action_space.n
        self.Q = DCNN(4, self.n)
        self.T = DCNN(4, self.n)
        self.T.load_state_dict(self.Q.state_dict())
        self.T.eval()
        self.memory = ReplayMemory(hparams.memory_size)
        self.steps = 0
        self.state = env.reset()
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=hparams.lr,
                                             momentum=hparams.momentum)
        self.n_episodes = 0

    @torch.no_grad()
    def select_action(self):
        hparams = self.hparams
        start = hparams.eps_start
        end = hparams.eps_end
        time = hparams.eps_time
        steps = self.steps
        self.steps += 1
        if steps < time:
            epsilon = start - (start - end) * steps / time
        else:
            epsilon = end

        sample = random.random()

        if sample > epsilon:
            return self.Q(s2t(self.state).to(device)).max(1)[1].item()
        else:
            return self.env.action_space.sample()

    def sample_step(self, fs_min=2, fs_max=6):
        """repeats a single action between fs_min and fs_max (inclusive) times"""
        fs = random.randint(fs_min, fs_max)
        action = self.select_action()
        r = 0
        for _ in range(fs):
            new_state, reward, done, _ = self.env.step(action)
            self.memory.push(self.state, action,
                             new_state if not done else None, reward)
            r += reward
            self.state = self.env.reset() if done else new_state
            if done:
                self.n_episodes += 1
        return r

    def optimize(self):
        hparams = self.hparams
        transitions = self.memory.sample(hparams.batch_size)
        batch = Transition(*zip(*transitions))
        states = torch.cat([s2t(state) for state in batch.state]).to(device)
        actions = torch.tensor(batch.action).unsqueeze(1).to(device)
        target_values = torch.tensor(
            batch.reward).unsqueeze(1).to(device).float()
        non_terminal_next_states = torch.cat([
            s2t(state) for state in batch.next_state if state is not None
        ]).to(device)
        non_terminal_mask = torch.tensor([
            state is not None for state in batch.next_state
        ]).to(device).unsqueeze(1)

        values = self.Q(states).gather(1, actions).float()
        target_values[non_terminal_mask] += hparams.gamma * self.T(
            non_terminal_next_states).detach().max(1)[0].float()

        #print(values.dtype,target_values.dtype)
        loss = F.smooth_l1_loss(values, target_values)
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.Q.parameters():
            param.grad.data.clamp_(-1, 1)  # maybe try sign_?

        self.optimizer.step()
        return loss
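
A driver loop for this class is not shown. A hedged sketch of what it might look like (the environment name, hyperparameter values, target-sync interval, and the assumption that ReplayMemory exposes __len__ are all illustrative, not taken from the source; s2t, DCNN, and device are assumed to be defined elsewhere, as the class itself requires):

from types import SimpleNamespace
import gym

hparams = SimpleNamespace(memory_size=100_000, lr=2.5e-4, momentum=0.95,
                          eps_start=1.0, eps_end=0.1, eps_time=100_000,
                          gamma=0.99, batch_size=32)   # illustrative values only

agent = DQN(gym.make('PongNoFrameskip-v4'), hparams)

for step in range(1_000_000):
    agent.sample_step()                        # collect frame-skipped experience
    if len(agent.memory) >= hparams.batch_size:
        agent.optimize()                       # one gradient step on Q
    if step % 1_000 == 0:                      # assumed target-network sync interval
        agent.T.load_state_dict(agent.Q.state_dict())
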
def train_dqn(env,
              num_steps,
              *,
              replay_size,
              batch_size,
              exploration,
              gamma,
              train_freq=1,
              print_freq=100,
              target_network_update_freq=500,
              t_learning_start=1000):
    """
    DQN algorithm.

    Compared to previous training procedures, we will train for a given number
    of time-steps rather than a given number of episodes.  The number of
    time-steps will be in the range of millions, which still results in many
    episodes being executed.

    Args:
        - env: The openai Gym environment
        - num_steps: Total number of steps to be used for training
        - replay_size: Maximum size of the ReplayMemory
        - batch_size: Number of experiences in a batch
        - exploration: a ExponentialSchedule
        - gamma: The discount factor

    Returns: (saved_models, returns)
        - saved_models: Dictionary whose values are trained DQN models
        - returns: Numpy array containing the return of each training episode
        - lengths: Numpy array containing the length of each training episode
        - losses: Numpy array containing the loss of each training batch
    """
    # check that environment states are compatible with our DQN representation
    assert (isinstance(env.observation_space, gym.spaces.Box)
            and len(env.observation_space.shape) == 1)

    # get the state_size from the environment
    state_size = env.observation_space.shape[0]

    # initialize the DQN and DQN-target models
    dqn_model = DQN(state_size, env.action_space.n)
    dqn_target = DQN.custom_load(dqn_model.custom_dump())

    # initialize the optimizer
    optimizer = torch.optim.Adam(dqn_model.parameters(), lr=5e-4)

    # initialize the replay memory
    memory = ReplayMemory(replay_size, state_size)

    # initiate lists to store returns, lengths and losses
    rewards = []
    returns = []
    lengths = []
    losses = []

    last_100_returns = deque(maxlen=100)
    last_100_lengths = deque(maxlen=100)

    # initiate a structure to store models at different stages of training
    # (not populated in this version of the function)
    saved_models = {}

    i_episode = 0
    t_episode = 0

    state = env.reset()

    # iterate for a total of `num_steps` steps
    for t_total in range(num_steps):
        # use t_total to indicate the time-step from the beginning of training

        if t_total >= t_learning_start:
            eps = exploration.value(t_total - t_learning_start)
        else:
            eps = 1.0
        action = select_action_epsilon_greedy(dqn_model, state, eps, env)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)

        rewards.append(reward)
        state = next_state

        if t_total >= t_learning_start and t_total % train_freq == 0:
            batch = memory.sample(batch_size)
            loss = train_dqn_batch(optimizer, batch, dqn_model, dqn_target,
                                   gamma)
            losses.append(loss)

        # update target network
        if t_total >= t_learning_start and t_total % target_network_update_freq == 0:
            dqn_target.load_state_dict(dqn_model.state_dict())

        if done:

            # Calculate episode returns
            G = 0
            for i in range(len(rewards)):
                G += rewards[i] * pow(gamma, i)

            # Collect results
            lengths.append(t_episode + 1)
            returns.append(G)

            last_100_returns.append(G)
            last_100_lengths.append(t_episode + 1)

            if i_episode % print_freq == 0:
                logger.record_tabular("time step", t_total)

                logger.record_tabular("episodes", i_episode)
                logger.record_tabular("step", t_episode + 1)
                logger.record_tabular("return", G)
                logger.record_tabular("mean reward", np.mean(last_100_returns))
                logger.record_tabular("mean length", np.mean(last_100_lengths))

                logger.record_tabular("% time spent exploring", int(100 * eps))
                logger.dump_tabular()

            # End of episode so reset time, reset rewards list
            t_episode = 0
            rewards = []

            # Environment terminated so reset it
            state = env.reset()

            # Increment the episode index
            i_episode += 1

        else:
            t_episode += 1

    return (
        dqn_model,
        np.array(returns),
        np.array(lengths),
        np.array(losses),
    )
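
The exploration schedule is only ever queried through exploration.value(step), so any object with that method works. A hedged sketch of an exponential schedule and of a call to train_dqn (the environment name and hyperparameter values are illustrative assumptions; DQN, ReplayMemory, select_action_epsilon_greedy, and train_dqn_batch are assumed to be defined as in the surrounding code):

import numpy as np
import gym

class ExponentialSchedule:
    """Exponential decay from value_from to value_to over num_steps steps."""
    def __init__(self, value_from, value_to, num_steps):
        self.value_from = value_from
        self.value_to = value_to
        self.num_steps = num_steps

    def value(self, step):
        # Clamp the step into [0, num_steps - 1], then interpolate exponentially.
        step = min(max(step, 0), self.num_steps - 1)
        b = np.log(self.value_to / self.value_from) / (self.num_steps - 1)
        return self.value_from * np.exp(b * step)

env = gym.make('CartPole-v1')   # 1-D Box observations, as the assert requires
dqn_model, returns, lengths, losses = train_dqn(
    env,
    num_steps=500_000,
    replay_size=100_000,
    batch_size=64,
    exploration=ExponentialSchedule(1.0, 0.05, 100_000),
    gamma=0.99,
)
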
Example #4
class Learner(object):

	def __init__(self, params, param_set_id, status_dict, shared_state, remote_mem):
		self.params = params
		self.param_set_id = param_set_id
		self.status_dict = status_dict
		self.shared_state = shared_state
		self.remote_mem = remote_mem

		gpu = 0
		torch.cuda.set_device(gpu)

		ep = params['env']
		ap = params['actor']
		lp = params['learner']
		rmp = params["replay_memory"]

		model_formula = f'model.{lp["model"]}(self.state_shape, self.action_dim).to(self.device)'
		optimizer_formula = lp["optimizer"].format('self.Q.parameters()')

		self.conn = psycopg2.connect(params["db"]["connection_string"])
		self.conn.autocommit = True
		self.cur = self.conn.cursor()

		self.device = torch.device("cuda:{}".format(gpu) if 0 <= gpu and torch.cuda.is_available() else "cpu")
		self.state_shape = ep['state_shape']
		self.batch_size = lp['replay_sample_size']
		self.action_dim = ep['action_dim']
		self.q_target_sync_freq = lp['q_target_sync_freq']
		self.num_q_updates = 0
		self.take_offsets = (torch.arange(self.batch_size) * self.action_dim).to(self.device)
		self.Q = eval(model_formula)
		self.Q_target = eval(model_formula) # Target Q network which is slow moving replica of self.Q
		self.optimizer = eval(optimizer_formula)
		self.replay_memory = ReplayMemory(rmp)

		self.train_num = 0
		self.model_file_name = lp['load_saved_state']
		if self.model_file_name and os.path.isfile(self.model_file_name):
			print(f'Loading {self.model_file_name}')
			saved_state = torch.load(self.model_file_name)
			self.Q.load_state_dict(saved_state['module'])
			self.optimizer.load_state_dict(saved_state['optimizer'])
			self.train_num = saved_state['train_num']

		self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
		    self.Q_target.state_dict())
		self.status_dict['Q_state_dict_stored'] = True

		self.last_Q_state_dict_id = 1
		self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
		self.status_dict['train_num'] = self.train_num

		self.gamma_n = params['actor']['gamma']**params['actor']['num_steps']

	def state_dict_to_cpu(self, state_dict):
		d = OrderedDict()
		for k, v in state_dict.items():
			d[k] = v.cpu()
		return d

	def add_experience_to_replay_mem(self):
		while self.remote_mem.qsize():
			priorities, batch = self.remote_mem.get()
			self.replay_memory.add(priorities, batch)

	def compute_loss_and_priorities(self, batch_size):
		indices, n_step_transition_batch, before_priorities = self.replay_memory.sample(batch_size)

		s = n_step_transition_batch[0].to(self.device)
		a = n_step_transition_batch[1].to(self.device)
		r = n_step_transition_batch[2].to(self.device)
		a_latest = n_step_transition_batch[3].to(self.device)
		s_latest = n_step_transition_batch[4].to(self.device)
		terminal = n_step_transition_batch[5].to(self.device)

		q = self.Q(s)
		q_a = q.take(self.take_offsets + a).squeeze()

		with torch.no_grad():
			self.Q_target.eval()
			Gt = r + (1.0 - terminal) * self.gamma_n * self.Q_target(s_latest).take(self.take_offsets + a_latest).squeeze()
			td_error = Gt - q_a

		loss = F.smooth_l1_loss(q_a, Gt)
		# loss = td_error**2 / 2

		# Compute the new priorities of the experience
		after_priorities = td_error.data.abs().cpu().numpy()
		self.replay_memory.set_priorities(indices, after_priorities)

		return loss, q, before_priorities, after_priorities, indices

	def update_Q(self, loss):
		self.optimizer.zero_grad()
		loss.backward()
		self.optimizer.step()
		self.num_q_updates += 1

		if self.num_q_updates % self.q_target_sync_freq == 0:
			self.Q_target.load_state_dict(self.Q.state_dict())
			print(f'Target Q synchronized.')
			return True
		else:
			return False

	def learn(self):
		t = tables.LearnerData()
		record_type = t.get_record_type()
		record_insert = t.get_insert()
		cur = self.cur
		param_set_id = self.param_set_id
		now = datetime.datetime.now
		step_num = 0
		target_sync_num = 0
		send_param_num = 0
		min_replay_mem_size = self.params['learner']["min_replay_mem_size"]

		print('learner waiting for replay memory.')
		while self.replay_memory.size() <= min_replay_mem_size:
			self.add_experience_to_replay_mem()
			time.sleep(0.01)
		step_num = 0
		print('learner start')
		while not self.status_dict['quit']:
			self.add_experience_to_replay_mem()
			# 4. Sample a prioritized batch of transitions
			# 5. & 7. Apply double-Q learning rule, compute loss and experience priorities
			# 8. Update priorities
			loss, q, before_priorities, after_priorities, indices = self.compute_loss_and_priorities(self.batch_size)
			if step_num % 10 == 0:
				print(f'loss : {loss}')
			#print("\nLearner: step_num=", step_num, "loss:", loss, "RPM.size:", self.replay_memory.size(), end='\r')
			# 6. Update parameters of the Q network(s)
			if self.update_Q(loss):
				target_sync_num += 1
			if step_num % 5 == 0:
				self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
				    self.Q_target.state_dict())
				self.last_Q_state_dict_id += 1
				self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
				print('Send params to actors.')
				send_param_num += 1

			# 9. Periodically remove old experience from replay memory
			step_num += 1
			self.train_num += 1
			self.status_dict['train_num'] = self.train_num

			# Insert the training record into the DB
			r = record_type(param_set_id, now(), self.train_num,
			                step_num, loss.item(), q[0].tolist(), before_priorities.tolist(), after_priorities.tolist(),
			                indices.tolist(), target_sync_num, send_param_num)
			record_insert(cur, r)

		print('learner end')

		state_dict = {'module': self.Q.state_dict(), 'optimizer': self.optimizer.state_dict(), 'train_num': self.train_num}
		torch.save(state_dict, self.model_file_name)
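
The constructor and the learn() loop imply that this Learner runs alongside actor processes, sharing a manager dict for status/parameters and a queue for experience. A heavily hedged launch sketch (the layout of params, the actor side, and the database setup are assumptions not shown in this example):

import multiprocessing as mp

def run_learner(params, param_set_id=0):
    # params is assumed to be the nested dict the Learner reads:
    # params['env'], params['actor'], params['learner'],
    # params['replay_memory'], params['db'].
    manager = mp.Manager()
    status_dict = manager.dict({'quit': False})
    shared_state = manager.dict()
    remote_mem = mp.Queue()   # actor processes put (priorities, batch) tuples here
    learner = Learner(params, param_set_id, status_dict, shared_state, remote_mem)
    learner.learn()           # runs until status_dict['quit'] is set by a controller
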
Example #5
class DDQN_separated_net(Agent_segment):
    def __init__(self, epsilon=0.3, memory_size=300, batch_size=16, model=navigation_model,
                 target_update_interval=1,
                 tau=0.005):
        super(DDQN_separated_net, self).__init__(epsilon=epsilon,
                                                 random_can_stop=False)

        # Memory
        self.memory = ReplayMemory(memory_size)

        # Batch size when learning
        self.batch_size = batch_size

        # number of time steps before an update of the delayed target Q network
        self.target_update_interval = target_update_interval

        # soft update weight of the delayed Q network
        self.tau = tau

    def learned_act(self, s, pred_oracle=True, online=False):
        if online:
            if pred_oracle:
                return torch.cat([self.model(s), oracle(s).unsqueeze(1)], 1)
        with torch.no_grad():
            if pred_oracle:
                return torch.cat([self.target_model(s), oracle(s).unsqueeze(1)], 1)
                # to do without oracle

    def reinforce(self, s_, a_, n_s_, r_, game_over_, env_steps_):
        # Two steps: first memorize the states, second learn from the pool

        self.memory.remember(s_, a_, n_s_, r_, game_over_)

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)


        # non_final_mask = torch.tensor(torch.cat(batch.game_over), device=device)==False
        non_final_mask = torch.cat(batch.game_over) == False

        non_final_next_states = torch.cat(batch.next_state)[non_final_mask]
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action).view(-1, 2)
        reward_batch = torch.cat(batch.reward)
        # non_final_next_states = torch.cat(batch.next_state)[non_final_index]

        # print(state_batch.shape)
        state_values = self.learned_act(state_batch, online=True)
        state_action_values = torch.cat(
            [s[a[0].item(), a[1].item()].unsqueeze(0) for s, a in zip(state_values, batch.action)])

        next_state_values = torch.zeros(self.batch_size, device=device)

        if len(non_final_next_states) > 0:
            with torch.no_grad():
                argmax_online = (self.learned_act(non_final_next_states, online=True)).view(non_final_next_states.shape[0],-1).argmax(1)
                # print(torch.tensor(range(self.batch_size), device=device)[non_final_mask])
                # print(self.learned_act(non_final_next_states, online=False).view(-1, 2*SEGMENT_LENGTH).shape)
                next_state_values[non_final_mask] = \
                self.learned_act(non_final_next_states, online=False).view(non_final_next_states.shape[0], -1)[
                    range(len(non_final_next_states)), argmax_online]

        expected_state_action_values = next_state_values + reward_batch

        loss = F.smooth_l1_loss(state_action_values[non_final_mask],
                                expected_state_action_values[non_final_mask])  # .unsqueeze(1))
        # loss = F.mse_loss(state_action_values[non_final_mask], expected_state_action_values[non_final_mask])

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            # HINT: Clip the target to avoid exploiding gradients.. -- clipping is a bit tighter
            param.grad.data.clamp_(-1e-6, 1e-6)
        self.optimizer.step()

        if env_steps_ % self.target_update_interval == 0:
            soft_update(self.target_model, self.model, self.tau)

        return float(loss)

    def save_model(self, model_path='model.pickle'):
        try:
            torch.save(self.model, model_path)
        except Exception:
            # Saving is best-effort; ignore failures rather than interrupt training.
            pass

    def load_model(self, model_path='model.pickle', local=True):
        if local:
            self.model = navigation_model()
            self.target_model = navigation_model()
            hard_update(self.target_model, self.model)
        else:
            self.model = torch.load('model.pickle')
            self.target_model = torch.load('model.pickle')
        if torch.cuda.is_available():
            print('Using GPU')
            self.model.cuda()
            self.target_model.cuda()
        else:
            print('Using CPU')
        self.optimizer = optim.RMSprop(self.model.parameters(), lr=1e-5)
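
soft_update and hard_update are used above but not defined in this example; the usual definitions (Polyak averaging for the soft case, a plain copy for the hard one) would look like this sketch:

import torch

def soft_update(target, source, tau):
    """target <- tau * source + (1 - tau) * target, parameter by parameter."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau)
            t_param.data.add_(tau * s_param.data)

def hard_update(target, source):
    """Copy all source parameters into the target network."""
    target.load_state_dict(source.state_dict())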