Example #1
class Agent:
    def __init__(self, env, env_w, device, config: Config):
        self.env = env
        self.env_w = env_w
        self.device = device
        self.cfg = config
        self.n_actions = config.n_actions
        self.policy_net = config.policy_net
        self.target_net = config.target_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []

    def select_action(self, state):
        self.steps_done += 1
        sample = random.random()
        eps_threshold = self.cfg.EPS_END + (self.cfg.EPS_START - self.cfg.EPS_END) * \
            math.exp(-1. * self.steps_done / self.cfg.EPS_DECAY)
        # Exploit with probability (1 - eps_threshold), otherwise explore.
        if sample > eps_threshold:
            with torch.no_grad():
                # policy_net(state).max(1) returns the largest Q-value in each row
                # together with its index, so max(1)[1] is the greedy action with
                # the larger expected reward.
                action = self.policy_net(state).max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return torch.tensor([[action]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.cfg.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.cfg.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.cfg.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.cfg.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def step(self, i_episode):
        # Initialize the environment and state
        self.env.reset()
        last_screen = self.env_w.get_screen()
        current_screen = self.env_w.get_screen()
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = self.select_action(state)
            obs, reward, done, _ = self.env.step(action.item())
            # Shaped reward: penalize |obs[2]| (the pole angle in CartPole)
            # instead of the raw environment reward (kept below for reference).
            # reward = torch.tensor([reward], device=self.device)
            reward = torch.tensor([-abs(obs[2])], device=self.device, dtype=torch.float)

            # Observe new state
            last_screen = current_screen
            current_screen = self.env_w.get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            self.memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            self.optimize_model()
            if done:
                self.episode_durations.append(t + 1)
                self.env_w.plot_durations(self.episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % self.cfg.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
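
This example and the next rely on a Transition namedtuple and a ReplayMemory class that are not shown. A minimal sketch of those two pieces, following the standard PyTorch DQN tutorial pattern (an assumption; the actual implementations used with these snippets may differ):

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        # A bounded deque drops the oldest transitions automatically.
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
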
Example #2
class DQN:
    def __init__(self, env, hparams):
        self.hparams = hparams
        self.env = env
        self.n = env.action_space.n
        self.Q = DCNN(4, self.n)
        self.T = DCNN(4, self.n)
        self.T.load_state_dict(self.Q.state_dict())
        self.T.eval()
        self.memory = ReplayMemory(hparams.memory_size)
        self.steps = 0
        self.state = env.reset()
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=hparams.lr,
                                             momentum=hparams.momentum)
        self.n_episodes = 0

    @torch.no_grad()
    def select_action(self):
        hparams = self.hparams
        start = hparams.eps_start
        end = hparams.eps_end
        time = hparams.eps_time
        steps = self.steps
        self.steps += 1
        if steps < time:
            epsilon = start - (start - end) * steps / time
        else:
            epsilon = end

        sample = random.random()

        if sample > epsilon:
            return self.Q(s2t(self.state).to(device)).max(1)[1].item()
        else:
            return self.env.action_space.sample()

    def sample_step(self, fs_min=2, fs_max=6):
        """repeats a single action between fs_min and fs_max (inclusive) times"""
        fs = random.randint(fs_min, fs_max)
        action = self.select_action()
        r = 0
        for _ in range(fs):
            new_state, reward, done, _ = self.env.step(action)
            self.memory.push(self.state, action,
                             new_state if not done else None, reward)
            r += reward
            self.state = self.env.reset() if done else new_state
            if done:
                self.n_episodes += 1
        return r

    def optimize(self):
        hparams = self.hparams
        transitions = self.memory.sample(hparams.batch_size)
        batch = Transition(*zip(*transitions))
        states = torch.cat([s2t(state) for state in batch.state]).to(device)
        actions = torch.tensor(batch.action).unsqueeze(1).to(device)
        target_values = torch.tensor(
            batch.reward).unsqueeze(1).to(device).float()
        non_terminal_next_states = torch.cat([
            s2t(state) for state in batch.next_state if state is not None
        ]).to(device)
        non_terminal_mask = torch.tensor([
            state is not None for state in batch.next_state
        ]).to(device).unsqueeze(1)

        values = self.Q(states).gather(1, actions).float()
        target_values[non_terminal_mask] += hparams.gamma * self.T(
            non_terminal_next_states).detach().max(1)[0].float()

        #print(values.dtype,target_values.dtype)
        loss = F.smooth_l1_loss(values, target_values)
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.Q.parameters():
            param.grad.data.clamp_(-1, 1)  # maybe try sign_?

        self.optimizer.step()
        return loss
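
A driver loop for this class is not shown. A hedged sketch of what it might look like (the environment name, hyperparameter values, target-sync interval, and the assumption that ReplayMemory exposes __len__ are all illustrative, not taken from the source; s2t, DCNN, and device are assumed to be defined elsewhere, as the class itself requires):

from types import SimpleNamespace
import gym

hparams = SimpleNamespace(memory_size=100_000, lr=2.5e-4, momentum=0.95,
                          eps_start=1.0, eps_end=0.1, eps_time=100_000,
                          gamma=0.99, batch_size=32)   # illustrative values only

agent = DQN(gym.make('PongNoFrameskip-v4'), hparams)

for step in range(1_000_000):
    agent.sample_step()                        # collect frame-skipped experience
    if len(agent.memory) >= hparams.batch_size:
        agent.optimize()                       # one gradient step on Q
    if step % 1_000 == 0:                      # assumed target-network sync interval
        agent.T.load_state_dict(agent.Q.state_dict())
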
def train_dqn(env,
              num_steps,
              *,
              replay_size,
              batch_size,
              exploration,
              gamma,
              train_freq=1,
              print_freq=100,
              target_network_update_freq=500,
              t_learning_start=1000):
    """
    DQN algorithm.

    Compared to previous training procedures, we will train for a given number
    of time-steps rather than a given number of episodes.  The number of
    time-steps will be in the range of millions, which still results in many
    episodes being executed.

    Args:
        - env: The openai Gym environment
        - num_steps: Total number of steps to be used for training
        - replay_size: Maximum size of the ReplayMemory
        - batch_size: Number of experiences in a batch
        - exploration: a ExponentialSchedule
        - gamma: The discount factor

    Returns: (saved_models, returns)
        - saved_models: Dictionary whose values are trained DQN models
        - returns: Numpy array containing the return of each training episode
        - lengths: Numpy array containing the length of each training episode
        - losses: Numpy array containing the loss of each training batch
    """
    # check that environment states are compatible with our DQN representation
    assert (isinstance(env.observation_space, gym.spaces.Box)
            and len(env.observation_space.shape) == 1)

    # get the state_size from the environment
    state_size = env.observation_space.shape[0]

    # initialize the DQN and DQN-target models
    dqn_model = DQN(state_size, env.action_space.n)
    dqn_target = DQN.custom_load(dqn_model.custom_dump())

    # initialize the optimizer
    optimizer = torch.optim.Adam(dqn_model.parameters(), lr=5e-4)

    # initialize the replay memory
    memory = ReplayMemory(replay_size, state_size)

    # initiate lists to store returns, lengths and losses
    rewards = []
    returns = []
    lengths = []
    losses = []

    last_100_returns = deque(maxlen=100)
    last_100_lengths = deque(maxlen=100)

    # initiate a structure to store models at different stages of training
    # (not populated in this version of the function)
    saved_models = {}

    i_episode = 0
    t_episode = 0

    state = env.reset()

    # iterate for a total of `num_steps` steps
    for t_total in range(num_steps):
        # use t_total to indicate the time-step from the beginning of training

        if t_total >= t_learning_start:
            eps = exploration.value(t_total - t_learning_start)
        else:
            eps = 1.0
        action = select_action_epsilon_greedy(dqn_model, state, eps, env)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)

        rewards.append(reward)
        state = next_state

        if t_total >= t_learning_start and t_total % train_freq == 0:
            batch = memory.sample(batch_size)
            loss = train_dqn_batch(optimizer, batch, dqn_model, dqn_target,
                                   gamma)
            losses.append(loss)

        # update target network
        if t_total >= t_learning_start and t_total % target_network_update_freq == 0:
            dqn_target.load_state_dict(dqn_model.state_dict())

        if done:

            # Calculate episode returns
            G = 0
            for i in range(len(rewards)):
                G += rewards[i] * pow(gamma, i)

            # Collect results
            lengths.append(t_episode + 1)
            returns.append(G)

            last_100_returns.append(G)
            last_100_lengths.append(t_episode + 1)

            if i_episode % print_freq == 0:
                logger.record_tabular("time step", t_total)

                logger.record_tabular("episodes", i_episode)
                logger.record_tabular("step", t_episode + 1)
                logger.record_tabular("return", G)
                logger.record_tabular("mean reward", np.mean(last_100_returns))
                logger.record_tabular("mean length", np.mean(last_100_lengths))

                logger.record_tabular("% time spent exploring", int(100 * eps))
                logger.dump_tabular()

            # End of episode so reset time, reset rewards list
            t_episode = 0
            rewards = []

            # Environment terminated so reset it
            state = env.reset()

            # Increment the episode index
            i_episode += 1

        else:
            t_episode += 1

    return (
        dqn_model,
        np.array(returns),
        np.array(lengths),
        np.array(losses),
    )
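
The exploration schedule is only ever queried through exploration.value(step), so any object with that method works. A hedged sketch of an exponential schedule and of a call to train_dqn (the environment name and hyperparameter values are illustrative assumptions; DQN, ReplayMemory, select_action_epsilon_greedy, and train_dqn_batch are assumed to be defined as in the surrounding code):

import numpy as np
import gym

class ExponentialSchedule:
    """Exponential decay from value_from to value_to over num_steps steps."""
    def __init__(self, value_from, value_to, num_steps):
        self.value_from = value_from
        self.value_to = value_to
        self.num_steps = num_steps

    def value(self, step):
        # Clamp the step into [0, num_steps - 1], then interpolate exponentially.
        step = min(max(step, 0), self.num_steps - 1)
        b = np.log(self.value_to / self.value_from) / (self.num_steps - 1)
        return self.value_from * np.exp(b * step)

env = gym.make('CartPole-v1')   # 1-D Box observations, as the assert requires
dqn_model, returns, lengths, losses = train_dqn(
    env,
    num_steps=500_000,
    replay_size=100_000,
    batch_size=64,
    exploration=ExponentialSchedule(1.0, 0.05, 100_000),
    gamma=0.99,
)
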
Example #4
class Learner(object):

	def __init__(self, params, param_set_id, status_dict, shared_state, remote_mem):
		self.params = params
		self.param_set_id = param_set_id
		self.status_dict = status_dict
		self.shared_state = shared_state
		self.remote_mem = remote_mem

		gpu = 0
		torch.cuda.set_device(gpu)

		ep = params['env']
		ap = params['actor']
		lp = params['learner']
		rmp = params["replay_memory"]

		model_formula = f'model.{lp["model"]}(self.state_shape, self.action_dim).to(self.device)'
		optimizer_formula = lp["optimizer"].format('self.Q.parameters()')

		self.conn = psycopg2.connect(params["db"]["connection_string"])
		self.conn.autocommit = True
		self.cur = self.conn.cursor()

		self.device = torch.device("cuda:{}".format(gpu) if 0 <= gpu and torch.cuda.is_available() else "cpu")
		self.state_shape = ep['state_shape']
		self.batch_size = lp['replay_sample_size']
		self.action_dim = ep['action_dim']
		self.q_target_sync_freq = lp['q_target_sync_freq']
		self.num_q_updates = 0
		self.take_offsets = (torch.arange(self.batch_size) * self.action_dim).to(self.device)
		self.Q = eval(model_formula)
		self.Q_target = eval(model_formula) # Target Q network which is slow moving replica of self.Q
		self.optimizer = eval(optimizer_formula)
		self.replay_memory = ReplayMemory(rmp)

		self.train_num = 0
		self.model_file_name = lp['load_saved_state']
		if self.model_file_name and os.path.isfile(self.model_file_name):
			print(f'Loading {self.model_file_name}')
			saved_state = torch.load(self.model_file_name)
			self.Q.load_state_dict(saved_state['module'])
			self.optimizer.load_state_dict(saved_state['optimizer'])
			self.train_num = saved_state['train_num']

		self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
		    self.Q_target.state_dict())
		self.status_dict['Q_state_dict_stored'] = True

		self.last_Q_state_dict_id = 1
		self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
		self.status_dict['train_num'] = self.train_num

		self.gamma_n = params['actor']['gamma']**params['actor']['num_steps']

	def state_dict_to_cpu(self, state_dict):
		d = OrderedDict()
		for k, v in state_dict.items():
			d[k] = v.cpu()
		return d

	def add_experience_to_replay_mem(self):
		while self.remote_mem.qsize():
			priorities, batch = self.remote_mem.get()
			self.replay_memory.add(priorities, batch)

	def compute_loss_and_priorities(self, batch_size):
		indices, n_step_transition_batch, before_priorities = self.replay_memory.sample(batch_size)

		s = n_step_transition_batch[0].to(self.device)
		a = n_step_transition_batch[1].to(self.device)
		r = n_step_transition_batch[2].to(self.device)
		a_latest = n_step_transition_batch[3].to(self.device)
		s_latest = n_step_transition_batch[4].to(self.device)
		terminal = n_step_transition_batch[5].to(self.device)

		q = self.Q(s)
		q_a = q.take(self.take_offsets + a).squeeze()

		with torch.no_grad():
			self.Q_target.eval()
			Gt = r + (1.0 - terminal) * self.gamma_n * self.Q_target(s_latest).take(self.take_offsets + a_latest).squeeze()
			td_error = Gt - q_a

		loss = F.smooth_l1_loss(q_a, Gt)
		# loss = td_error**2 / 2

		# Compute the new priorities of the experience
		after_priorities = td_error.data.abs().cpu().numpy()
		self.replay_memory.set_priorities(indices, after_priorities)

		return loss, q, before_priorities, after_priorities, indices

	def update_Q(self, loss):
		self.optimizer.zero_grad()
		loss.backward()
		self.optimizer.step()
		self.num_q_updates += 1

		if self.num_q_updates % self.q_target_sync_freq == 0:
			self.Q_target.load_state_dict(self.Q.state_dict())
			print(f'Target Q synchronized.')
			return True
		else:
			return False

	def learn(self):
		t = tables.LearnerData()
		record_type = t.get_record_type()
		record_insert = t.get_insert()
		cur = self.cur
		param_set_id = self.param_set_id
		now = datetime.datetime.now
		step_num = 0
		target_sync_num = 0
		send_param_num = 0
		min_replay_mem_size = self.params['learner']["min_replay_mem_size"]

		print('learner waiting for replay memory.')
		while self.replay_memory.size() <= min_replay_mem_size:
			self.add_experience_to_replay_mem()
			time.sleep(0.01)
		step_num = 0
		print('learner start')
		while not self.status_dict['quit']:
			self.add_experience_to_replay_mem()
			# 4. Sample a prioritized batch of transitions
			# 5. & 7. Apply double-Q learning rule, compute loss and experience priorities
			# 8. Update priorities
			loss, q, before_priorities, after_priorities, indices = self.compute_loss_and_priorities(self.batch_size)
			if step_num % 10 == 0:
				print(f'loss : {loss}')
			#print("\nLearner: step_num=", step_num, "loss:", loss, "RPM.size:", self.replay_memory.size(), end='\r')
			# 6. Update parameters of the Q network(s)
			if self.update_Q(loss):
				target_sync_num += 1
			if step_num % 5 == 0:
				self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
				    self.Q_target.state_dict())
				self.last_Q_state_dict_id += 1
				self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
				print('Send params to actors.')
				send_param_num += 1

			# 9. Periodically remove old experience from replay memory
			step_num += 1
			self.train_num += 1
			self.status_dict['train_num'] = self.train_num

			# Insert the training record into the DB
			r = record_type(param_set_id, now(), self.train_num,
			                step_num, loss.item(), q[0].tolist(), before_priorities.tolist(), after_priorities.tolist(),
			                indices.tolist(), target_sync_num, send_param_num)
			record_insert(cur, r)

		print('learner end')

		state_dict = {'module': self.Q.state_dict(), 'optimizer': self.optimizer.state_dict(), 'train_num': self.train_num}
		torch.save(state_dict, self.model_file_name)
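
The constructor and the learn() loop imply that this Learner runs alongside actor processes, sharing a manager dict for status/parameters and a queue for experience. A heavily hedged launch sketch (the layout of params, the actor side, and the database setup are assumptions not shown in this example):

import multiprocessing as mp

def run_learner(params, param_set_id=0):
    # params is assumed to be the nested dict the Learner reads:
    # params['env'], params['actor'], params['learner'],
    # params['replay_memory'], params['db'].
    manager = mp.Manager()
    status_dict = manager.dict({'quit': False})
    shared_state = manager.dict()
    remote_mem = mp.Queue()   # actor processes put (priorities, batch) tuples here
    learner = Learner(params, param_set_id, status_dict, shared_state, remote_mem)
    learner.learn()           # runs until status_dict['quit'] is set by a controller
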
Example #5
class DDQN_separated_net(Agent_segment):
    def __init__(self, epsilon=0.3, memory_size=300, batch_size=16, model=navigation_model,
                 target_update_interval=1,
                 tau=0.005):
        super(DDQN_separated_net, self).__init__(epsilon=epsilon,
                                                 random_can_stop=False)

        # Memory
        self.memory = ReplayMemory(memory_size)

        # Batch size when learning
        self.batch_size = batch_size

        # number of time steps before an update of the delayed target Q network
        self.target_update_interval = target_update_interval

        # soft update weight of the delayed Q network
        self.tau = tau

    def learned_act(self, s, pred_oracle=True, online=False):
        if online:
            if pred_oracle:
                return torch.cat([self.model(s), oracle(s).unsqueeze(1)], 1)
        with torch.no_grad():
            if pred_oracle:
                return torch.cat([self.target_model(s), oracle(s).unsqueeze(1)], 1)
                # to do without oracle

    def reinforce(self, s_, a_, n_s_, r_, game_over_, env_steps_):
        # Two steps: first memorize the states, second learn from the pool

        self.memory.remember(s_, a_, n_s_, r_, game_over_)

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)


        # non_final_mask = torch.tensor(torch.cat(batch.game_over), device=device)==False
        non_final_mask = torch.cat(batch.game_over) == False

        non_final_next_states = torch.cat(batch.next_state)[non_final_mask]
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action).view(-1, 2)
        reward_batch = torch.cat(batch.reward)
        # non_final_next_states = torch.cat(batch.next_state)[non_final_index]

        # print(state_batch.shape)
        state_values = self.learned_act(state_batch, online=True)
        state_action_values = torch.cat(
            [s[a[0].item(), a[1].item()].unsqueeze(0) for s, a in zip(state_values, batch.action)])

        next_state_values = torch.zeros(self.batch_size, device=device)

        if len(non_final_next_states) > 0:
            with torch.no_grad():
                argmax_online = (self.learned_act(non_final_next_states, online=True)).view(non_final_next_states.shape[0],-1).argmax(1)
                # print(torch.tensor(range(self.batch_size), device=device)[non_final_mask])
                # print(self.learned_act(non_final_next_states, online=False).view(-1, 2*SEGMENT_LENGTH).shape)
                next_state_values[non_final_mask] = \
                self.learned_act(non_final_next_states, online=False).view(non_final_next_states.shape[0], -1)[
                    range(len(non_final_next_states)), argmax_online]

        expected_state_action_values = next_state_values + reward_batch

        loss = F.smooth_l1_loss(state_action_values[non_final_mask],
                                expected_state_action_values[non_final_mask])  # .unsqueeze(1))
        # loss = F.mse_loss(state_action_values[non_final_mask], expected_state_action_values[non_final_mask])

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            # HINT: Clip the target to avoid exploiding gradients.. -- clipping is a bit tighter
            param.grad.data.clamp_(-1e-6, 1e-6)
        self.optimizer.step()

        if env_steps_ % self.target_update_interval == 0:
            soft_update(self.target_model, self.model, self.tau)

        return float(loss)

    def save_model(self, model_path='model.pickle'):
        try:
            torch.save(self.model, model_path)
        except Exception:
            # Saving is best-effort; ignore failures rather than interrupt training.
            pass

    def load_model(self, model_path='model.pickle', local=True):
        if local:
            self.model = navigation_model()
            self.target_model = navigation_model()
            hard_update(self.target_model, self.model)
        else:
            self.model = torch.load('model.pickle')
            self.target_model = torch.load('model.pickle')
        if torch.cuda.is_available():
            print('Using GPU')
            self.model.cuda()
            self.target_model.cuda()
        else:
            print('Using CPU')
        self.optimizer = optim.RMSprop(self.model.parameters(), lr=1e-5)
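
soft_update and hard_update are used above but not defined in this example; the usual definitions (Polyak averaging for the soft case, a plain copy for the hard one) would look like this sketch:

import torch

def soft_update(target, source, tau):
    """target <- tau * source + (1 - tau) * target, parameter by parameter."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau)
            t_param.data.add_(tau * s_param.data)

def hard_update(target, source):
    """Copy all source parameters into the target network."""
    target.load_state_dict(source.state_dict())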