Example #1
def main():
    # Create CartPole environment and network
    env = gym.make('CartPole-v0').unwrapped
    if not os.path.exists(model_path):
        raise Exception("You should train the DQN first!")
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              epsilon=epsilon,
              batch_size=batch_size,
              model_path=model_path)
    net.load()
    net.cuda()
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()

            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, _ = env.step(a)

            total_reward += r
            if finish:
                print("Episode: %d \t Total reward: %d \t Eps: %f" %
                      (i, total_reward, net.epsilon))
                reward_list.append(total_reward)
                break
            s = s_
    env.close()
    print("Testing average reward: ", np.mean(reward_list))
Example #2
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max,
                                      args.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space)
        if args.model and os.path.isfile(args.model):
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
        if args.cuda:
            self.online_net.cuda()
            self.target_net.cuda()
            self.support = self.support.cuda()

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[1][0]

    # Acts with an ε-greedy policy
    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(
            self.action_space) if random.random() < epsilon else self.act(
                state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)

        # Calculate current state probabilities
        self.online_net.reset_noise()  # Sample new noise for online network
        ps = self.online_net(states)  # Probabilities p(s_t, ·; θonline)
        ps_a = ps[range(self.batch_size), actions]  # p(s_t, a_t; θonline)

        # Calculate nth next state probabilities
        self.online_net.reset_noise()  # Sample new noise for action selection
        pns = self.online_net(
            next_states).data  # Probabilities p(s_t+n, ·; θonline)
        dns = self.support.expand_as(
            pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
        argmax_indices_ns = dns.sum(2).max(
            1
        )[1]  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
        self.target_net.reset_noise()  # Sample new target net noise
        pns = self.target_net(
            next_states).data  # Probabilities p(s_t+n, ·; θtarget)
        pns_a = pns[range(
            self.batch_size
        ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

        # Compute Tz (Bellman operator T applied to z)
        Tz = returns.unsqueeze(1) + nonterminals * (
            self.discount**self.n) * self.support.unsqueeze(
                0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
        Tz = Tz.clamp(min=self.Vmin,
                      max=self.Vmax)  # Clamp between supported values
        # Compute L2 projection of Tz onto fixed support z
        b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
        l, u = b.floor().long(), b.ceil().long()
        # Fix disappearing probability mass when l = b = u (b is int)
        l[(u > 0) * (l == u)] -= 1
        u[(l < (self.atoms - 1)) * (l == u)] += 1

        # Distribute probability of Tz
        m = states.data.new(self.batch_size, self.atoms).zero_()
        offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                self.batch_size).unsqueeze(1).expand(
                                    self.batch_size,
                                    self.atoms).type_as(actions)
        m.view(-1).index_add_(
            0, (l + offset).view(-1),
            (pns_a *
             (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
        m.view(-1).index_add_(
            0, (u + offset).view(-1),
            (pns_a *
             (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        ps_a = ps_a.clamp(min=1e-3)  # Clamp for numerical stability in log
        loss = -torch.sum(
            Variable(m) * ps_a.log(),
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        (weights * loss).mean().backward()  # Importance weight losses
        self.optimiser.step()

        mem.update_priorities(
            idxs, loss.data)  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def save(self, path):
        torch.save(self.online_net.state_dict(),
                   os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[0][0]

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
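The least obvious part of learn() above is the projection of the shifted support Tz back onto the fixed atoms with index_add_. Below is a self-contained toy run of that projection with assumed values (2 transitions, 5 atoms), independent of the class above:

import torch

# Toy setup: 2 transitions, 5 atoms on [-2, 2]
batch_size, atoms, Vmin, Vmax = 2, 5, -2.0, 2.0
support = torch.linspace(Vmin, Vmax, atoms)          # tensor([-2., -1., 0., 1., 2.])
delta_z = (Vmax - Vmin) / (atoms - 1)                # 1.0
pns_a = torch.full((batch_size, atoms), 0.2)         # uniform next-state distribution
returns = torch.tensor([0.5, -0.3])
nonterminals = torch.tensor([[1.0], [1.0]])
discount, n = 0.99, 3

Tz = returns.unsqueeze(1) + nonterminals * (discount ** n) * support.unsqueeze(0)
Tz = Tz.clamp(min=Vmin, max=Vmax)
b = (Tz - Vmin) / delta_z                            # fractional atom indices
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) & (l == u)] -= 1                           # avoid losing mass when b is integral
u[(l < (atoms - 1)) & (l == u)] += 1

m = torch.zeros(batch_size, atoms)
offset = (torch.arange(batch_size) * atoms).unsqueeze(1).expand(batch_size, atoms)
m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))
m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))
print(m.sum(1))                                      # each row still sums to 1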
Example #3
class Agent():
  def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.priority_exponent = args.priority_exponent
    self.max_gradient_norm = args.max_gradient_norm

    self.policy_net = DQN(args, self.action_space)
    if args.model and os.path.isfile(args.model):
      self.policy_net.load_state_dict(torch.load(args.model))
    self.policy_net.train()

    self.target_net = DQN(args, self.action_space)
    self.update_target_net()
    self.target_net.eval()

    self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
    if args.cuda:
      self.policy_net.cuda()
      self.target_net.cuda()
      self.support = self.support.cuda()

  # Resets noisy weights in all linear layers (of policy and target nets)
  def reset_noise(self):
    self.policy_net.reset_noise()
    self.target_net.reset_noise()

  # Acts based on single state (no batch)
  def act(self, state):
    return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0]

  def learn(self, mem):
    idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)
    batch_size = len(idxs)  # May return less than specified if invalid transitions sampled

    # Calculate current state probabilities
    ps = self.policy_net(states)  # Probabilities p(s_t, ·; θpolicy)
    ps_a = ps[range(batch_size), actions]  # p(s_t, a_t; θpolicy)

    # Calculate nth next state probabilities
    pns = self.policy_net(next_states).data  # Probabilities p(s_t+n, ·; θpolicy)
    dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θpolicy))
    argmax_indices_ns = dns.sum(2).max(1)[1]  # Perform argmax action selection using policy network: argmax_a[(z, p(s_t+n, a; θpolicy))]
    pns = self.target_net(next_states).data  # Probabilities p(s_t+n, ·; θtarget)
    pns_a = pns[range(batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θpolicy))]; θtarget)
    pns_a *= nonterminals  # Set p = 0 for terminal nth next states as all possible expected returns = expected reward at final transition

    # Compute Tz (Bellman operator T applied to z)
    Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
    Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
    # Compute L2 projection of Tz onto fixed support z
    b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
    l, u = b.floor().long(), b.ceil().long()

    # Distribute probability of Tz
    m = states.data.new(batch_size, self.atoms).zero_()
    offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).long().unsqueeze(1).expand(batch_size, self.atoms).type_as(actions)
    m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
    m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

    loss = -torch.sum(Variable(m) * ps_a.log(), 1)  # Cross-entropy loss (minimises Kullback-Leibler divergence)
    self.policy_net.zero_grad()
    (weights * loss).mean().backward()  # Importance weight losses
    nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.max_gradient_norm)  # Clip gradients (normalising by max value of gradient L2 norm)
    self.optimiser.step()

    mem.update_priorities(idxs, loss.data.abs().pow(self.priority_exponent))  # Update priorities of sampled transitions

  def update_target_net(self):
    self.target_net.load_state_dict(self.policy_net.state_dict())

  def save(self, path):
    torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth'))

  # Evaluates Q-value based on single state (no batch)
  def evaluate_q(self, state):
    return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0]

  def train(self):
    self.policy_net.train()

  def eval(self):
    self.policy_net.eval()
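Example #3 differs from Example #2 mainly in gradient clipping and in zeroing the next-state distribution for terminal transitions. The Double-Q step (select the action with the policy/online net, evaluate it with the target net) is the same in both; a small self-contained illustration with random distributions (all values assumed):

import torch

# Toy Double-DQN action selection over a categorical distribution:
# the policy net picks the action, the target net supplies its distribution.
batch_size, actions, atoms = 1, 3, 5
support = torch.linspace(-2.0, 2.0, atoms)
pns_policy = torch.softmax(torch.randn(batch_size, actions, atoms), dim=2)
pns_target = torch.softmax(torch.randn(batch_size, actions, atoms), dim=2)

expected_q = (pns_policy * support).sum(2)           # batch x actions
argmax_a = expected_q.argmax(1)                      # chosen by the policy net
pns_a = pns_target[range(batch_size), argmax_a]      # evaluated by the target net
print(argmax_a, pns_a.shape)                         # torch.Size([1, 5])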
Example #4
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor

Qt = DQN(in_channels=5, num_actions=18).type(dtype)
Qt_t = DQN(in_channels=5, num_actions=18).type(dtype)
Qt_t.load_state_dict(Qt.state_dict())
Qt_t.eval()
for param in Qt_t.parameters():
    param.requires_grad = False

if torch.cuda.device_count() > 0:
    Qt.cuda()
    Qt = nn.DataParallel(Qt).to(device0)
    Qt_t = nn.DataParallel(Qt_t).to(device0)
    batch_size = BATCH_SIZE * torch.cuda.device_count()
else:
    batch_size = BATCH_SIZE

# optimizer
optimizer = optim.RMSprop(Qt.parameters(),
                          lr=LEARNING_RATE,
                          alpha=ALPHA,
                          eps=EPS)

# training parameters
# Create environment
import gym
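A hedged continuation of Example #4 (not part of the original snippet): one way the Qt network and dtype alias above are typically used for epsilon-greedy action selection. The helper name and constants below are assumptions for illustration only.

import random

def select_action(state, eps_greedy):
    # state: a 1 x 5 x H x W frame stack already converted with .type(dtype)
    if random.random() < eps_greedy:
        return random.randrange(18)           # 18 actions, matching num_actions above
    with torch.no_grad():
        return int(Qt(state).argmax(dim=1).item())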
Example #5
def train():
    # global args
    # args = parser.parse_args()
    Learner = DQN().to(device)

    env = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1')
    # env = retro.make(game='Airstriker-Genesis', state='Level1')

    criterion = L2_loss(0.99).to(device)

    if is_cuda:
        Learner = Learner.cuda()
        criterion = criterion.cuda()

    optimizer = optim.SGD(Learner.parameters(), lr=0.01)

    eps_threshold = 0.8
    RM = ReplayMemory(1000)
    A_agent = ActorAgent(Learner, args)
    print("Start Episodes")
    for i_episode in range(50000):
        env.reset()
        A_agent.reset(Learner, args)
        last_state = get_screen(env)
        current_state = get_screen(env)
        state = current_state - last_state
        # state_var = torch.autograd.Variable(state)
        state_var = state.to(device)
        total_reward = 0
        if i_episode % 50 == 0:
            eps_threshold = 0.9
        for t in count():
            if t == 0:
                print("episode begin")
            eps_threshold -= 0.000019
            action_q = A_agent.act(state_var, eps_threshold)
            """
            if is_cuda:
                action_q = action_q.cpu()
                _, action = action_q.data.max(2)
            else:
                _, action = action_q.data.max(2)
            """
            _, action = action_q.data.max(2)

            action_numpy = action.squeeze(0).numpy()
            # print(list(action_numpy))
            for i in range(4):
                _, reward, done, _ = env.step(action_numpy)
                total_reward += reward
            last_state = current_state
            current_state = get_screen(env)
            state = current_state - last_state
            # state_var = torch.autograd.Variable(state)
            state_var = state.to(device)
            # Save the state after taking the action
            A_agent.add_to_buffer(reward, action_q, state_var)

            # Store the state in the ReplayMemory
            if len(A_agent.localbuffer) > 10:
                p, error = calc_priority_TDerror(Learner, criterion, A_agent,
                                                 10)

                RM.push(p, error)

            if done:
                break

            # Optimize Learner model
            # if t%100==0 and len(A_agent.localbuffer)>80 and len(RM)>=30:
        for i in range(4):
            error_batch = RM.priority_sample(30)

            optimizer.zero_grad()
            # error_batch.backward(retain_graph=True)
            error_batch.backward()
            # Clamp gradients before the single optimizer step
            for param in Learner.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()
            print("{0}\t{1}\tLoss:{2}\tTotal:{3}\tReward:{4}".format(
                i_episode,
                t,
                float(error_batch),
                total_reward,
                reward,
            ))
        RM.reset()
        # env.render()

        with open("total_reward.txt", "a") as f:
            f.write("{0}\t{1}".format(i_episode, total_reward))
            f.write("\n")
Example #6
class Agent:
    def __init__(self):
        self.mode = "train"
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        print(self.config)
        self.load_config()

        self.online_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.target_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.online_net.train()
        self.target_net.train()
        self.update_target_net()
        for param in self.target_net.parameters():
            param.requires_grad = False

        if self.use_cuda:
            self.online_net.cuda()
            self.target_net.cuda()

        self.naozi = ObservationPool(capacity=self.naozi_capacity)
        # optimizer
        self.optimizer = torch.optim.Adam(
            self.online_net.parameters(),
            lr=self.config['training']['optimizer']['learning_rate'])
        self.clip_grad_norm = self.config['training']['optimizer'][
            'clip_grad_norm']

    def load_config(self):
        # word vocab
        with open("vocabularies/word_vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        # char vocab
        with open("vocabularies/char_vocab.txt") as f:
            self.char_vocab = f.read().split("\n")
        self.char2id = {}
        for i, w in enumerate(self.char_vocab):
            self.char2id[w] = i

        self.EOS_id = self.word2id["</s>"]
        self.train_data_size = self.config['general']['train_data_size']
        self.question_type = self.config['general']['question_type']
        self.random_map = self.config['general']['random_map']
        self.testset_path = self.config['general']['testset_path']
        self.naozi_capacity = self.config['general']['naozi_capacity']
        self.eval_folder = pjoin(
            self.testset_path, self.question_type,
            ("random_map" if self.random_map else "fixed_map"))
        self.eval_data_path = pjoin(self.testset_path, "data.json")

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training'][
            'max_nb_steps_per_episode']
        self.max_episode = self.config['training']['max_episode']
        self.target_net_update_frequency = self.config['training'][
            'target_net_update_frequency']
        self.learn_start_from_this_episode = self.config['training'][
            'learn_start_from_this_episode']

        self.run_eval = self.config['evaluate']['run_eval']
        self.eval_batch_size = self.config['evaluate']['batch_size']
        self.eval_max_nb_steps_per_episode = self.config['evaluate'][
            'max_nb_steps_per_episode']

        # Set the random seed manually for reproducibility.
        self.random_seed = self.config['general']['random_seed']
        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print(
                    "WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml"
                )
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.random_seed)
                self.use_cuda = True
        else:
            self.use_cuda = False

        if self.question_type == "location":
            self.answer_type = "pointing"
        elif self.question_type in ["attribute", "existence"]:
            self.answer_type = "2 way"
        else:
            raise NotImplementedError

        self.save_checkpoint = self.config['checkpoint']['save_checkpoint']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.load_pretrained = self.config['checkpoint']['load_pretrained']
        self.load_from_tag = self.config['checkpoint']['load_from_tag']

        self.qa_loss_lambda = self.config['training']['qa_loss_lambda']
        self.interaction_loss_lambda = self.config['training'][
            'interaction_loss_lambda']

        # replay buffer and updates
        self.discount_gamma = self.config['replay']['discount_gamma']
        self.replay_batch_size = self.config['replay']['replay_batch_size']
        self.command_generation_replay_memory = command_generation_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=self.config['replay']
            ['replay_memory_priority_fraction'],
            discount_gamma=self.discount_gamma)
        self.qa_replay_memory = qa_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=0.0)
        self.update_per_k_game_steps = self.config['replay'][
            'update_per_k_game_steps']
        self.multi_step = self.config['replay']['multi_step']

        # distributional RL
        self.use_distributional = self.config['distributional']['enable']
        self.atoms = self.config['distributional']['atoms']
        self.v_min = self.config['distributional']['v_min']
        self.v_max = self.config['distributional']['v_max']
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atoms)  # Support (range) of z
        if self.use_cuda:
            self.support = self.support.cuda()
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)

        # dueling networks
        self.dueling_networks = self.config['dueling_networks']

        # double dqn
        self.double_dqn = self.config['double_dqn']

        # counting reward
        self.revisit_counting_lambda_anneal_episodes = self.config[
            'episodic_counting_bonus'][
                'revisit_counting_lambda_anneal_episodes']
        self.revisit_counting_lambda_anneal_from = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_from']
        self.revisit_counting_lambda_anneal_to = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_to']
        self.revisit_counting_lambda = self.revisit_counting_lambda_anneal_from

        # valid command bonus
        self.valid_command_bonus_lambda = self.config[
            'valid_command_bonus_lambda']

        # epsilon greedy
        self.epsilon_anneal_episodes = self.config['epsilon_greedy'][
            'epsilon_anneal_episodes']
        self.epsilon_anneal_from = self.config['epsilon_greedy'][
            'epsilon_anneal_from']
        self.epsilon_anneal_to = self.config['epsilon_greedy'][
            'epsilon_anneal_to']
        self.epsilon = self.epsilon_anneal_from
        self.noisy_net = self.config['epsilon_greedy']['noisy_net']
        if self.noisy_net:
            # disable epsilon greedy
            self.epsilon_anneal_episodes = -1
            self.epsilon = 0.0

        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.single_word_verbs = set(["inventory", "look", "wait"])
        self.two_word_verbs = set(["go"])

    def train(self):
        """
        Tell the agent that it's training phase.
        """
        self.mode = "train"
        self.online_net.train()

    def eval(self):
        """
        Tell the agent that it's evaluation phase.
        """
        self.mode = "eval"
        self.online_net.eval()

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def reset_noise(self):
        if self.noisy_net:
            # Resets noisy weights in all linear layers (of online net only)
            self.online_net.reset_noise()

    def zero_noise(self):
        if self.noisy_net:
            self.online_net.zero_noise()
            self.target_net.zero_noise()

    def load_pretrained_model(self, load_from):
        """
        Load pretrained checkpoint from file.

        Arguments:
            load_from: File name of the pretrained model checkpoint.
        """
        print("loading model from %s\n" % (load_from))
        try:
            if self.use_cuda:
                state_dict = torch.load(load_from)
            else:
                state_dict = torch.load(load_from, map_location='cpu')
            self.online_net.load_state_dict(state_dict)
        except:
            print("Failed to load checkpoint...")

    def save_model_to_path(self, save_to):
        torch.save(self.online_net.state_dict(), save_to)
        print("Saved checkpoint to %s..." % (save_to))

    def init(self, obs, infos):
        """
        Prepare the agent for the upcoming games.

        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        # reset agent, get vocabulary masks for verbs / adjectives / nouns
        batch_size = len(obs)
        self.reset_binarized_counter(batch_size)
        self.not_finished_yet = np.ones((batch_size, ), dtype="float32")
        self.prev_actions = [["" for _ in range(batch_size)]]
        self.prev_step_is_still_interacting = np.ones(
            (batch_size, ), dtype="float32"
        )  # 1s and starts to be 0 when previous action is "wait"
        self.naozi.reset(batch_size=batch_size)

    def get_agent_inputs(self, string_list):
        sentence_token_list = [item.split() for item in string_list]
        sentence_id_list = [
            _words_to_ids(tokens, self.word2id)
            for tokens in sentence_token_list
        ]
        input_sentence_char = list_of_token_list_to_char_input(
            sentence_token_list, self.char2id)
        input_sentence = pad_sequences(
            sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32')
        input_sentence = to_pt(input_sentence, self.use_cuda)
        input_sentence_char = to_pt(input_sentence_char, self.use_cuda)
        return input_sentence, input_sentence_char, sentence_id_list

    def get_game_info_at_certain_step(self, obs, infos):
        """
        Get all needed info from game engine for training.
        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        batch_size = len(obs)
        feedback_strings = [preproc(item, tokenizer=self.nlp) for item in obs]
        description_strings = [
            preproc(item, tokenizer=self.nlp) for item in infos["description"]
        ]
        observation_strings = [
            d + " <|> " + fb if fb != d else d + " <|> hello"
            for fb, d in zip(feedback_strings, description_strings)
        ]

        inventory_strings = [
            preproc(item, tokenizer=self.nlp) for item in infos["inventory"]
        ]
        local_word_list = [
            obs.split() + inv.split()
            for obs, inv in zip(observation_strings, inventory_strings)
        ]

        directions = ["east", "west", "north", "south"]
        if self.question_type in ["location", "existence"]:
            # the agent observes the env but does not change it
            possible_verbs = [["go", "inventory", "wait", "open", "examine"]
                              for _ in range(batch_size)]
        else:
            possible_verbs = [
                list(set(item) - set(["", "look"])) for item in infos["verbs"]
            ]

        possible_adjs, possible_nouns = [], []
        for i in range(batch_size):
            object_nouns = [
                item.split()[-1] for item in infos["object_nouns"][i]
            ]
            object_adjs = [
                w for item in infos["object_adjs"][i] for w in item.split()
            ]
            possible_nouns.append(
                list(set(object_nouns) & set(local_word_list[i]) - set([""])) +
                directions)
            possible_adjs.append(
                list(set(object_adjs) & set(local_word_list[i]) - set([""])) +
                ["</s>"])

        return observation_strings, [
            possible_verbs, possible_adjs, possible_nouns
        ]

    def get_state_strings(self, infos):
        description_strings = infos["description"]
        inventory_strings = infos["inventory"]
        observation_strings = [
            _d + _i for (_d, _i) in zip(description_strings, inventory_strings)
        ]
        return observation_strings

    def get_local_word_masks(self, possible_words):
        possible_verbs, possible_adjs, possible_nouns = possible_words
        batch_size = len(possible_verbs)

        verb_mask = np.zeros((batch_size, len(self.word_vocab)),
                             dtype="float32")
        noun_mask = np.zeros((batch_size, len(self.word_vocab)),
                             dtype="float32")
        adj_mask = np.zeros((batch_size, len(self.word_vocab)),
                            dtype="float32")
        for i in range(batch_size):
            for w in possible_verbs[i]:
                if w in self.word2id:
                    verb_mask[i][self.word2id[w]] = 1.0
            for w in possible_adjs[i]:
                if w in self.word2id:
                    adj_mask[i][self.word2id[w]] = 1.0
            for w in possible_nouns[i]:
                if w in self.word2id:
                    noun_mask[i][self.word2id[w]] = 1.0
        adj_mask[:, self.EOS_id] = 1.0

        return [verb_mask, adj_mask, noun_mask]

    def get_match_representations(self,
                                  input_observation,
                                  input_observation_char,
                                  input_quest,
                                  input_quest_char,
                                  use_model="online"):
        model = self.online_net if use_model == "online" else self.target_net
        description_representation_sequence, description_mask = model.representation_generator(
            input_observation, input_observation_char)
        quest_representation_sequence, quest_mask = model.representation_generator(
            input_quest, input_quest_char)

        match_representation_sequence = model.get_match_representations(
            description_representation_sequence, description_mask,
            quest_representation_sequence, quest_mask)
        match_representation_sequence = match_representation_sequence * description_mask.unsqueeze(
            -1)
        return match_representation_sequence

    def get_ranks(self,
                  input_observation,
                  input_observation_char,
                  input_quest,
                  input_quest_char,
                  word_masks,
                  use_model="online"):
        """
        Given input observation and question tensors, to get Q values of words.
        """
        model = self.online_net if use_model == "online" else self.target_net
        match_representation_sequence = self.get_match_representations(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            use_model=use_model)
        action_ranks = model.action_scorer(match_representation_sequence,
                                           word_masks)  # list of 3 tensors
        return action_ranks

    def choose_maxQ_command(self, action_ranks, word_mask=None):
        """
        Generate a command by maximum q values, for epsilon greedy.
        """
        if self.use_distributional:
            action_ranks = [
                (item * self.support).sum(2) for item in action_ranks
            ]  # list of batch x n_vocab
        action_indices = []
        for i in range(len(action_ranks)):
            ar = action_ranks[i]
            ar = ar - torch.min(
                ar, -1, keepdim=True
            )[0] + 1e-2  # minus the min value, so that all values are non-negative
            if word_mask is not None:
                assert word_mask[i].size() == ar.size(), (
                    word_mask[i].size(), ar.size())
                ar = ar * word_mask[i]
            action_indices.append(torch.argmax(ar, -1))  # batch
        return action_indices

    def choose_random_command(self,
                              batch_size,
                              action_space_size,
                              possible_words=None):
        """
        Generate a command randomly, for epsilon greedy.
        """
        action_indices = []
        for i in range(3):
            if possible_words is None:
                indices = np.random.choice(action_space_size, batch_size)
            else:
                indices = []
                for j in range(batch_size):
                    mask_ids = []
                    for w in possible_words[i][j]:
                        if w in self.word2id:
                            mask_ids.append(self.word2id[w])
                    indices.append(np.random.choice(mask_ids))
                indices = np.array(indices)
            action_indices.append(to_pt(indices, self.use_cuda))  # batch
        return action_indices

    def get_chosen_strings(self, chosen_indices):
        """
        Turns list of word indices into actual command strings.
        chosen_indices: Word indices chosen by model.
        """
        chosen_indices_np = [to_np(item) for item in chosen_indices]
        res_str = []
        batch_size = chosen_indices_np[0].shape[0]
        for i in range(batch_size):
            verb, adj, noun = chosen_indices_np[0][i], chosen_indices_np[1][
                i], chosen_indices_np[2][i]
            res_str.append(self.word_ids_to_commands(verb, adj, noun))
        return res_str

    def word_ids_to_commands(self, verb, adj, noun):
        """
        Turn the 3 indices into actual command strings.

        Arguments:
            verb: Index of the guessing verb in vocabulary
            adj: Index of the guessing adjective in vocabulary
            noun: Index of the guessing noun in vocabulary
        """
        # turns 3 indices into actual command strings
        if self.word_vocab[verb] in self.single_word_verbs:
            return self.word_vocab[verb]
        if self.word_vocab[verb] in self.two_word_verbs:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        if adj == self.EOS_id:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        else:
            return " ".join([
                self.word_vocab[verb], self.word_vocab[adj],
                self.word_vocab[noun]
            ])

    def act_random(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        with torch.no_grad():
            batch_size = len(obs)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)
            chosen_indices = word_indices_random
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act_greedy(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            batch_size = len(obs)
            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step (greedy, no epsilon exploration here)
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            chosen_indices = word_indices_maxq
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act(self,
            obs,
            infos,
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            possible_words,
            random=False):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            if self.mode == "eval":
                return self.act_greedy(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            if random:
                return self.act_random(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            batch_size = len(obs)

            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step; epsilon-greedy is applied, i.e.,
            # there is an epsilon chance of generating random commands
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)

            # random number for epsilon greedy
            rand_num = np.random.uniform(low=0.0,
                                         high=1.0,
                                         size=(batch_size, ))
            less_than_epsilon = (rand_num < self.epsilon).astype(
                "float32")  # batch
            greater_than_epsilon = 1.0 - less_than_epsilon
            less_than_epsilon = to_pt(less_than_epsilon,
                                      self.use_cuda,
                                      type='long')
            greater_than_epsilon = to_pt(greater_than_epsilon,
                                         self.use_cuda,
                                         type='long')
            chosen_indices = [
                less_than_epsilon * idx_random +
                greater_than_epsilon * idx_maxq
                for idx_random, idx_maxq in zip(word_indices_random,
                                                word_indices_maxq)
            ]
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def get_dqn_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.command_generation_replay_memory) < self.replay_batch_size:
            return None

        data = self.command_generation_replay_memory.get_batch(
            self.replay_batch_size, self.multi_step)
        if data is None:
            return None

        obs_list, quest_list, possible_words_list, chosen_indices, rewards, next_obs_list, next_possible_words_list, actual_n_list = data
        batch_size = len(actual_n_list)

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, _ = self.get_agent_inputs(
            obs_list)
        next_input_observation, next_input_observation_char, _ = self.get_agent_inputs(
            next_obs_list)

        possible_words, next_possible_words = [], []
        for i in range(3):
            possible_words.append([item[i] for item in possible_words_list])
            next_possible_words.append(
                [item[i] for item in next_possible_words_list])

        local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(possible_words)
        ]
        next_local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(next_possible_words)
        ]

        action_ranks = self.get_ranks(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            local_word_masks,
            use_model="online"
        )  # list of batch x vocab or list of batch x vocab x atoms
        # ps_a
        word_qvalues = [
            ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1)
            for w_rank, idx in zip(action_ranks, chosen_indices)
        ]  # list of batch or list of batch x atoms
        q_value = torch.mean(torch.stack(word_qvalues, -1),
                             -1)  # batch or batch x atoms
        # log_ps_a
        log_q_value = torch.log(q_value)  # batch or batch x atoms

        with torch.no_grad():
            if self.noisy_net:
                self.target_net.reset_noise()  # Sample new target net noise
            if self.double_dqn:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="online")
                # list of batch x vocab or list of batch x vocab x atoms
                # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                # pns # Probabilities p(s_t+n, ·; θtarget)
                next_action_ranks = self.get_ranks(
                    next_input_observation,
                    next_input_observation_char,
                    input_quest,
                    input_quest_char,
                    next_local_word_masks,
                    use_model="target"
                )  # batch x vocab or list of batch x vocab x atoms
                # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms
            else:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="target")
                # list of batch x vocab or list of batch x vocab x atoms
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms

            next_q_value = torch.mean(torch.stack(next_word_qvalues, -1),
                                      -1)  # batch or batch x atoms
            # Compute Tz (Bellman operator T applied to z)
            discount = to_pt((np.ones_like(actual_n_list) *
                              self.discount_gamma)**actual_n_list,
                             self.use_cuda,
                             type="float")
        if not self.use_distributional:
            rewards = rewards + next_q_value * discount  # batch
            loss = F.smooth_l1_loss(q_value, rewards)
            return loss

        with torch.no_grad():
            Tz = rewards.unsqueeze(
                -1) + discount.unsqueeze(-1) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.v_min,
                          max=self.v_max)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.v_min) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = torch.zeros(batch_size, self.atoms).float()
            if self.use_cuda:
                m = m.cuda()
            offset = torch.linspace(0, ((batch_size - 1) * self.atoms),
                                    batch_size).unsqueeze(1).expand(
                                        batch_size, self.atoms).long()
            if self.use_cuda:
                offset = offset.cuda()
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (next_q_value *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (next_q_value *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_q_value,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        loss = torch.mean(loss)
        return loss

    def update_interaction(self):
        # update neural model by replaying snapshots in replay memory
        interaction_loss = self.get_dqn_loss()
        if interaction_loss is None:
            return None
        loss = interaction_loss * self.interaction_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(interaction_loss))

    def answer_question(self,
                        input_observation,
                        input_observation_char,
                        observation_id_list,
                        input_quest,
                        input_quest_char,
                        use_model="online"):
        # first pad answerer_input, and get the mask
        model = self.online_net if use_model == "online" else self.target_net
        batch_size = len(observation_id_list)
        max_length = input_observation.size(1)
        mask = compute_mask(input_observation)  # batch x obs_len

        # noun mask for location question
        if self.question_type in ["location"]:
            location_mask = []
            for i in range(batch_size):
                m = [1 for item in observation_id_list[i]]
                location_mask.append(m)
            location_mask = pad_sequences(location_mask,
                                          maxlen=max_length,
                                          dtype="float32")
            location_mask = to_pt(location_mask,
                                  enable_cuda=self.use_cuda,
                                  type='float')
            assert mask.size() == location_mask.size()
            mask = mask * location_mask

        match_representation_sequence = self.get_match_representations(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            use_model=use_model)
        pred = model.answer_question(match_representation_sequence,
                                     mask)  # batch x vocab or batch x 2

        # attention sum:
        # sometimes a word appears multiple times in the observation,
        # so we need to merge its scores before doing further computations.
        # However, if the answer type is not pointing, we just use a pre-defined
        # mapping of 0/1 to their positions in the vocab
        if self.answer_type == "2 way":
            observation_id_list = []
            max_length = 2
            for i in range(batch_size):
                observation_id_list.append(
                    [self.word2id["0"], self.word2id["1"]])

        observation = to_pt(
            pad_sequences(observation_id_list,
                          maxlen=max_length).astype('int32'), self.use_cuda)
        vocab_distribution = np.zeros(
            (batch_size, len(self.word_vocab)))  # batch x vocab
        vocab_distribution = to_pt(vocab_distribution,
                                   self.use_cuda,
                                   type='float')
        vocab_distribution = vocab_distribution.scatter_add_(
            1, observation, pred)  # batch x vocab
        non_zero_words = []
        for i in range(batch_size):
            non_zero_words.append(list(set(observation_id_list[i])))
        vocab_mask = torch.ne(vocab_distribution, 0).float()
        return vocab_distribution, non_zero_words, vocab_mask

    def point_maxq_position(self, vocab_distribution, mask):
        """
        Generate a command by maximum q values, for epsilon greedy.

        Arguments:
            vocab_distribution: Q values for each word in the vocabulary.
            mask: vocab mask.
        """
        vocab_distribution = vocab_distribution - torch.min(
            vocab_distribution, -1, keepdim=True
        )[0] + 1e-2  # minus the min value, so that all values are non-negative
        vocab_distribution = vocab_distribution * mask  # batch x vocab
        indices = torch.argmax(vocab_distribution, -1)  # batch
        return indices

    def answer_question_act_greedy(self, input_observation,
                                   input_observation_char, observation_id_list,
                                   input_quest, input_quest_char):

        with torch.no_grad():
            vocab_distribution, _, vocab_mask = self.answer_question(
                input_observation,
                input_observation_char,
                observation_id_list,
                input_quest,
                input_quest_char,
                use_model="online")  # batch x time
            positions_maxq = self.point_maxq_position(vocab_distribution,
                                                      vocab_mask)
            return positions_maxq  # batch

    def get_qa_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.qa_replay_memory) < self.replay_batch_size:
            return None
        transitions = self.qa_replay_memory.sample(self.replay_batch_size)
        batch = qa_memory.qa_Transition(*zip(*transitions))

        observation_list = batch.observation_list
        quest_list = batch.quest_list
        answer_strings = batch.answer_strings
        answer_position = np.array(_words_to_ids(answer_strings, self.word2id))
        groundtruth = to_pt(answer_position, self.use_cuda)  # batch

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, observation_id_list = self.get_agent_inputs(
            observation_list)

        answer_distribution, _, _ = self.answer_question(
            input_observation,
            input_observation_char,
            observation_id_list,
            input_quest,
            input_quest_char,
            use_model="online")  # batch x vocab

        batch_loss = NegativeLogLoss(answer_distribution, groundtruth)  # batch
        return torch.mean(batch_loss)

    def update_qa(self):
        # update neural model by replaying snapshots in replay memory
        qa_loss = self.get_qa_loss()
        if qa_loss is None:
            return None
        loss = qa_loss * self.qa_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(qa_loss))

    def finish_of_episode(self, episode_no, batch_size):
        # Update target network
        if (
                episode_no + batch_size
        ) % self.target_net_update_frequency <= episode_no % self.target_net_update_frequency:
            self.update_target_net()
        # decay lambdas
        if episode_no < self.learn_start_from_this_episode:
            return
        if episode_no < self.epsilon_anneal_episodes + self.learn_start_from_this_episode:
            self.epsilon -= (self.epsilon_anneal_from - self.epsilon_anneal_to
                             ) / float(self.epsilon_anneal_episodes)
            self.epsilon = max(self.epsilon, 0.0)
        if episode_no < self.revisit_counting_lambda_anneal_episodes + self.learn_start_from_this_episode:
            self.revisit_counting_lambda -= (
                self.revisit_counting_lambda_anneal_from -
                self.revisit_counting_lambda_anneal_to) / float(
                    self.revisit_counting_lambda_anneal_episodes)
            self.revisit_counting_lambda = max(self.revisit_counting_lambda, 0.0)

    def reset_binarized_counter(self, batch_size):
        self.binarized_counter_dict = [{} for _ in range(batch_size)]

    def get_binarized_count(self, observation_strings, update=True):
        count_rewards = []
        batch_size = len(observation_strings)
        for i in range(batch_size):
            key = observation_strings[i]
            if key not in self.binarized_counter_dict[i]:
                self.binarized_counter_dict[i][key] = 0.0
            if update:
                self.binarized_counter_dict[i][key] += 1.0
            r = self.binarized_counter_dict[i][key]
            r = float(r == 1.0)
            count_rewards.append(r)
        return count_rewards
Example #7
class Actor:
    def __init__(self,
                 learner,
                 param_server,
                 actor_idx,
                 epsilon,
                 num_channels=3,
                 num_actions=19):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make(ENV_NAME)
        self.port_number = 12340 + actor_idx
        print("actor environment %d initialize successfully" % self.actor_idx)
        self.env.make_interactive(port=self.port_number, realtime=False)
        self.learner_state_dict = ray.get(learner.get_state_dict.remote())
        print("getting learner state dict finished...")
        # network initialization
        self.actor_network = DQN(num_channels, num_actions).cuda()
        self.actor_target_network = DQN(num_channels, num_actions).cuda()
        self.actor_network.load_state_dict(self.learner_state_dict)
        self.actor_target_network.load_state_dict(self.learner_state_dict)
        print("actor network %d initialize successfully" % self.actor_idx)

        self.param_server = param_server
        self.epi_counter = 0
        self.max_epi = 100
        self.n_step = 4
        self.update_period = 4
        self.gamma = 0.99

        # exploring info
        self.epsilon = epsilon
        self.endEpsilon = 0.01
        self.stepDrop = (self.epsilon - self.endEpsilon) / self.max_epi
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)

        self.writer = SummaryWriter(f'runs/apex/actor{self.actor_idx}')

        # 1. Copy the network parameters
        # 2. Explore the environment (reset, act)
        # 3. Store transitions in the local buffer
        # 4. Compute priorities
        # 5. Push to the global buffer
        # 6. Periodically update the networks

    def get_epi_counter(self):
        return self.epi_counter

    def update_params(self, learner):
        ray.get(self.param_server.pull_from_learner.remote(learner))
        policy_params, target_params = ray.get(
            self.param_server.push_to_actor.remote())
        self.actor_network.load_state_dict(policy_params)
        self.actor_target_network.load_state_dict(target_params)

    def append_sample(self,
                      memory,
                      state,
                      action,
                      reward,
                      next_state,
                      done,
                      n_rewards=None):
        # Calculating priority (TD error)
        target = self.actor_network(state).data
        old_val = target[0][action].cpu()
        target_val = self.actor_target_network(next_state).data.cpu()
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + 0.99 * torch.max(target_val)

        # TD-error magnitude used as the priority; move to CPU to match old_val
        error = abs(old_val - target[0][action].cpu())
        state_ = state.cpu()
        next_state_ = next_state.cpu()

        if isinstance(memory, Memory):
            if n_rewards is None:
                memory.add(error, [state_, action, reward, next_state_, done])
            else:
                memory.add(
                    error,
                    (state_, action, reward, next_state_, done, n_rewards))

        else:
            if n_rewards is None:
                memory.add.remote(error,
                                  [state_, action, reward, next_state_, done])
            else:
                memory.add.remote(
                    error,
                    (state_, action, reward, next_state_, done, n_rewards))

    def explore(self, learner, memory):
        for num_epi in range(self.max_epi):
            obs = self.env.reset()
            state = converter(ENV_NAME, obs).cuda()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            if (self.epsilon > self.endEpsilon):
                self.epsilon -= self.stepDrop

            # initialize local_buffer
            n_step = self.n_step
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
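            # Discount factors gamma^0 .. gamma^(n-1) for the truncated n-step return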
            gamma_list = [self.gamma**i for i in range(n_step)]

            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state, self.epsilon)
                action_index = a_out
                action = make_19action(self.env, action_index)
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(ENV_NAME, obs_prime).cuda()

                # put transition in local buffer
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
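                # Discounted sum of the buffered rewards: an n-step return
                # estimate anchored at the oldest transition in the window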
                n_rewards = sum([
                    gamma * reward
                    for gamma, reward in zip(gamma_list, n_step_reward_buffer)
                ])
                n_step_n_rewards_buffer.append(n_rewards)

                if (len(n_step_state_buffer) >= n_step):
                    # Compute Priorities
                    for i in range(n_step):
                        self.append_sample(memory, n_step_state_buffer[i],
                                           n_step_action_buffer[i],
                                           n_step_reward_buffer[i],
                                           n_step_next_state_buffer[i],
                                           n_step_done_buffer[i],
                                           n_step_n_rewards_buffer[i])
                        if (n_step_done_buffer[i]):
                            break
                state = state_prime
                self.actor_network.cuda()
                self.actor_target_network.cuda()

                if done:
                    print("%d episode is done" % num_epi)
                    print("total rewards : %d " % total_reward)
                    self.writer.add_scalar('Rewards/train', total_reward,
                                           num_epi)
                    self.epi_counter += 1
                    if (num_epi % self.update_period == 0):
                        self.update_params(learner)
                    break
Example #8
def train(args):
    model = DQN(game=args.game)
    if args.use_pretrained:
        pretrained_weight = torch.load(
            sorted(glob(os.path.join('ckpt', args.tag, '*.pth')))[-1])
        model.load_state_dict(pretrained_weight)
    else:
        os.makedirs(os.path.join('ckpt', args.tag), exist_ok=True)
        model.apply(init_weights)
    model = model.cuda()
    start = time.time()

    episode = 0
    iteration = 0
    epsilon = args.epsilon
    decayed = args.decayed

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # instantiate game
    game = Game(game=args.game)
    high_score = 0

    # initialize replay memory
    D = deque()

    elapsed_time = 0
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    score = game.reward
    terminal = game.game_over()

    image_data = game.get_torch_image().cuda()
    state = torch.cat(
        (image_data, image_data, image_data, image_data)).unsqueeze(0)

    start = time.time()

    while iteration < args.iteration:
        output = model(state)[0]
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)

        # epsilon greedy exploration
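        # eps decays linearly from args.epsilon down to args.decayed over args.iteration steps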
        eps = epsilon - iteration * (epsilon - decayed) / args.iteration
        random_action = random.random() <= eps

        # Pick action --> random or index of maximum q value
        action_index = [
            torch.randint(
                model.number_of_actions, torch.Size([]), dtype=torch.int)
            if random_action else torch.argmax(output)
        ][0]
        action[action_index] = 1

        elapsed_time = time.time() - start

        # get next state and reward
        reward = game.act(action_index)
        terminal = game.game_over()
        image_data_1 = game.get_torch_image().cuda()

        state_1 = torch.cat(
            (state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0).cuda()
        action = action.unsqueeze(0).cuda()
        reward = torch.from_numpy(np.array(
            [reward], dtype=np.float32)).unsqueeze(0).cuda()

        # save transition to replay memory
        D.append(
            (state.cpu(), action.cpu(), reward.cpu(), state_1.cpu(), terminal))

        # if replay memory is full, remove the oldest transition
        if len(D) > args.replayMemorySize:
            D.popleft()

        # sample random minibatch
        minibatch = random.sample(D, min(len(D), args.minibatchSize))

        state_batch = torch.cat(tuple(d[0] for d in minibatch)).cuda()
        action_batch = torch.cat(tuple(d[1] for d in minibatch)).cuda()
        reward_batch = torch.cat(tuple(d[2] for d in minibatch)).cuda()
        state_1_batch = torch.cat(tuple(d[3] for d in minibatch)).cuda()

        # get output for the next state
        output_1_batch = model(state_1_batch)

        y_batch = torch.cat(
            tuple(reward_batch[i] if minibatch[i][4] else reward_batch[i] +
                  args.gamma * torch.max(output_1_batch[i])
                  for i in range(len(minibatch))))
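        # y_batch holds the Bellman targets: r for terminal transitions,
        # otherwise r + gamma * max_a' Q(s', a')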

        # Q-values of the actions actually taken (this example has no separate target network)
        q_value = torch.sum(model(state_batch) * action_batch, dim=1)

        # LR warmup
        if iteration < 20000:
            for g in optimizer.param_groups:
                g['lr'] = args.lr * iteration / 20000

        optimizer.zero_grad()
        y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)

        loss.backward()
        optimizer.step()

        state = state_1
        iteration += 1
        score += game.reward

        args.writer.add_scalar('Train/lr', optimizer.param_groups[0]['lr'],
                               iteration)
        args.writer.add_scalar('Train/epsilon', eps, iteration)
        args.writer.add_scalar('Train/loss', loss, iteration)
        args.writer.add_scalar('Train/replay_memory', len(D), iteration)

        if terminal:
            score = score - game.reward_terminal
            args.writer.add_scalar('Episode/elapsed_time', elapsed_time,
                                   episode)
            args.writer.add_scalar('Episode/episode', episode, episode)
            args.writer.add_scalar('Episode/score', score, episode)
            game.reset_game()
            episode += 1
            start = time.time()
            print(
                'Episode {} (Iteration {}): Agent passed {} pipes! Time: {:.3f}'
                .format(episode, iteration, score, elapsed_time))
            if score > high_score:
                print('Weight Saved!')
                high_score = score
                torch.save(
                    model,
                    os.path.join(
                        'ckpt', args.tag,
                        'E{:07d}_S{:03d}.pth'.format(episode, int(score))))
            score = 0
    print("Saving final model")
    torch.save(
        model,
        os.path.join('ckpt', args.tag,
                     'E_{:07d}_S{:03d}.pth'.format(episode, int(high_score))))
Example #9
def main():
    parser = argparse.ArgumentParser(description='DQN Breakout Script')
    parser.add_argument('--use-cuda',
                        action='store_true',
                        default=False,
                        help='whether to use CUDA (default: False)')
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        metavar='M',
                        help='batch size (default: 128)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.999,
                        metavar='M',
                        help='gamma (default: 0.999)')
    parser.add_argument('--eps-start',
                        type=float,
                        default=0.9,
                        metavar='M',
                        help='eps start (default: 0.9)')
    parser.add_argument('--eps-end',
                        type=float,
                        default=0.05,
                        metavar='M',
                        help='eps end (default: 0.05)')
    parser.add_argument('--eps-decay',
                        type=int,
                        default=200,
                        metavar='M',
                        help='eps decay (default: 200)')
    parser.add_argument('--num-obs-in-state',
                        type=int,
                        default=4,
                        metavar='M',
                        help='num observations in state (default: 4)')
    parser.add_argument('--replay-memory-capacity',
                        type=int,
                        default=10000,
                        metavar='M',
                        help='replay memory capacity (default: 10000)')
    parser.add_argument('--num-episodes',
                        type=int,
                        default=10,
                        metavar='M',
                        help='num of episodes (default: 10)')
    parser.add_argument('--reset-period',
                        type=int,
                        default=5,
                        metavar='M',
                        help='period to reset target network (default: 5)')
    parser.add_argument('--atari-env',
                        type=str,
                        default='Breakout-v0',
                        metavar='M',
                        help='Atari environment to use (default: Breakout-v0)')
    args = parser.parse_args()

    env = gym.envs.make(args.atari_env)

    model = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)
    model_target = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)

    if args.use_cuda:
        model.cuda()
        model_target.cuda()

    optimizer = optim.RMSprop(model.parameters())
    memory = ReplayMemory(args.replay_memory_capacity)

    epsilons = np.linspace(args.eps_start, args.eps_end, args.eps_decay)
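    # epsilons is a linear schedule over the first eps_decay steps; afterwards eps is held at eps_end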
    step_idx = 1
    reset_idx = 1

    tfs = get_transforms()

    episode_reward = 0.
    episode_length = 0

    for i_episode in range(args.num_episodes):
        # Initialize the environment and state
        obs = env.reset()
        state_processor = StateProcessor(args.num_obs_in_state, tfs, obs)
        state = state_processor.get_state()

        while True:
            episode_length += 1
            if step_idx < args.eps_decay:
                eps = epsilons[step_idx]
            else:
                eps = args.eps_end

            action = select_action(model, state, env.action_space.n, eps,
                                   args.use_cuda)
            # print('%d %d' % (episode_length, action[0,0]))
            next_obs, reward, done, info = env.step(action[0, 0])
            episode_reward += reward
            reward = torch.Tensor([reward])
            if args.use_cuda:
                reward = reward.cuda()

            if not done:
                state_processor.push_obs(next_obs)
                next_state = state_processor.get_state()
            else:
                next_state = None  # None next_state marks done

            memory.push(state, action, next_state, reward)

            # optimize
            optimize_model(optimizer, memory, model, model_target,
                           args.batch_size, args.gamma, args.use_cuda)

            step_idx += 1
            reset_idx += 1
            if reset_idx == args.reset_period:
                reset_idx = 1
                model_target.load_state_dict(model.state_dict())

            if done:
                break

        print(episode_reward)
        print(episode_length)
        episode_reward = 0.
        episode_length = 0
Example #10
class QAgent(Agent):
    def __init__(self):
        self.fex = Extractor()
        self.net = DQN()
        try:
            self.net.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu')))
        except FileNotFoundError:
            print("No saved weights found, starting with new weights")
        self.net.eval()
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.net.parameters())
        self.memory = ReplayMemory()
        self.training = False

        self.s = None
        self.a = None
        self.score = None

    def registerInitialState(self, state):
        self.s = None
        self.a = None
        self.score = None

    def getAction(self, game_state):
        legal = game_state.getLegalPacmanActions()
        if Directions.STOP in legal: legal.remove(Directions.STOP)
        state = self.fex(game_state)
        if self.training:
            state = state.cuda()
        with torch.no_grad():
            scores = self.net(state)
        scores = list(zip(ACTIONS, scores))
        legal_scores = [p for p in scores if p[0] in legal]
        action = max(legal_scores, key = lambda p: p[1])[0]

        if self.training:
            if random.random() < EPSILON:
                action = random.choice(legal)
            if self.s is not None:
                reward = game_state.getScore() - self.score
                reward = process_reward(self.s, state, reward)
                next_legals = game_state.getLegalActions()
                if Directions.STOP in next_legals: next_legals.remove(Directions.STOP)
                next_legals = (ACTION_MAP[d] for d in next_legals)
                self.memory.push(self.s, self.a, reward, state, next_legals)
            self.s = state
            self.a = ACTION_MAP[action]
            self.score = game_state.getScore()
        return action

    def final(self, state):
        if self.training:
            # fixed terminal penalty (overrides the raw score difference)
            reward = -10
            self.memory.push(self.s, self.a, reward, None, [])


    def train(self):
        global EPSILON
        self.training = True
        self.net.cuda()
        runners, names = load_runners()

        for epoch in range(EPOCHS):
            for t in self.net.parameters():
                print(t.data)
            if epoch <= 4:
                EPSILON = [0.8, 0.5, 0.3, 0.1, 0.01][epoch]
            print('Epoch {} | EPSILON {}'.format(epoch, EPSILON))
            g_dict = {}

            for runner, name in zip(runners, names):
                games = []
                for game_idx in range(GAMES_PER_EPOCH):
                    game = runner.run_game(self)
                    games.append(game)
                    for _ in range(SAMPLES_PER_GAME):
                        self.training_iteration()

                avg = np.mean([game.state.getScore() for game in games])
                wins = sum([game.state.isWin() for game in games])
                #print(f'{name}: {avg:0.2f} | {wins}/{GAMES_PER_EPOCH}')
                print('{}: {} | {}/{}'.format(name,avg, wins, GAMES_PER_EPOCH))
            print()
            torch.save(self.net.state_dict(), 'model.pth')


    def training_iteration(self):
        # sample mini-batch
        sarsl = self.memory.sample()
        if sarsl is None:
            return
        else:
            states, actions, rewards, next_states, next_state_legals = sarsl

        # replace deaths (None) with zeros
        for i, s in enumerate(next_states):
            if s is None:
                next_states[i] = self.fex.empty()
        next_states = torch.stack(next_states) 
        # get max Q(s',a'); deaths get value 0
        with torch.no_grad():
            next_actions_values = self.net(next_states)
            best_actions_values = []
            for next_legals, action_vals in zip(next_state_legals, next_actions_values):
                legal_vals = [v for (idx,v) in enumerate(action_vals) if idx in next_legals]
                if legal_vals == []:
                    legal_vals = [0]
                best_actions_values.append(max(legal_vals))
            best_actions_values = torch.tensor(best_actions_values).cuda()
        
            # compute target values
            targets = rewards + GAMMA*best_actions_values

        # compute current action values
        actions = actions.reshape(len(actions), 1)
        self.net.train()
        action_values = self.net(states).gather(1, actions).reshape(len(actions))
        self.net.eval()
        
        # compute loss and backpropagate it
        loss = self.criterion(action_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def play(self, path):
        runner = LocalPacmanGameRunner(layout_path=path,
                                       random_ghosts=True,
                                       show_window=True,
                                       zoom_window=1.0,
                                       frame_time=0.1,
                                       timeout=-1000)
        game = runner.run_game(self)
Example #11
# For training
var_batch_phi = autograd.Variable(torch.Tensor(batch_size, 4, 84, 84)).cuda()
var_batch_a = autograd.Variable(torch.LongTensor(batch_size, 1),
                                requires_grad=False).cuda()
var_batch_r = autograd.Variable(torch.Tensor(batch_size, 1)).cuda()
var_batch_phi_next = autograd.Variable(torch.Tensor(batch_size, 4, 84,
                                                    84)).cuda()
var_batch_r_mask = autograd.Variable(torch.Tensor(batch_size, 1),
                                     requires_grad=False).cuda()

MP = MemoryReplay(memory_size, batch_size)
dqn = DQN()
target_dqn = DQN()
target_dqn.load_state_dict(dqn.state_dict())

dqn.cuda()
target_dqn.cuda()

optimz = optim.RMSprop(dqn.parameters(),
                       lr=0.0025,
                       alpha=0.9,
                       eps=1e-02,
                       momentum=0.0)

pong = Pong()

for i in range(memory_size):
    phi = pong.current_phi
    act_index = random.randrange(3)
    phi_next, r, done = pong.step(VALID_ACTION[act_index])
    pong.display()
Example #12
class Agent:
    def __init__(self, time_step, split, lr):
        self.dataset = Dataset(T=time_step,
                               split_ratio=split,
                               binary_file=config.BINARY_DATASET)
        self.policy_net_encoder = AttnEncoder(
            input_size=self.dataset.get_num_features(),
            hidden_size=config.ENCODER_HIDDEN_SIZE,
            time_step=time_step)
        self.policy_net_decoder = AttnDecoder(
            code_hidden_size=config.ENCODER_HIDDEN_SIZE,
            hidden_size=config.DECODER_HIDDEN_SIZE,
            time_step=time_step)
        self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
        self.target_net_encoder = AttnEncoder(
            input_size=self.dataset.get_num_features(),
            hidden_size=config.ENCODER_HIDDEN_SIZE,
            time_step=time_step)
        self.target_net_decoder = AttnDecoder(
            code_hidden_size=config.ENCODER_HIDDEN_SIZE,
            hidden_size=config.DECODER_HIDDEN_SIZE,
            time_step=time_step)
        self.target_net = DQN(self.target_net_encoder, self.target_net_decoder)
        if torch.cuda.is_available():
            self.policy_net_encoder = self.policy_net_encoder.cuda()
            self.policy_net_decoder = self.policy_net_decoder.cuda()
            self.target_net_encoder = self.target_net_encoder.cuda()
            self.target_net_decoder = self.target_net_decoder.cuda()
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()
        self.memory = ReplayMemory(config.MEMORY_CAPACITY)
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

    def select_action(self, state, test=False):
        global steps_done
        sample = random.random()
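        # Epsilon decays exponentially from EPS_START toward EPS_END with time
        # constant EPS_DECAY as steps_done grows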
        eps_threshold = config.EPS_END + (
            config.EPS_START - config.EPS_END) * math.exp(
                -1. * steps_done / config.EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold or test:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            if torch.cuda.is_available():
                return torch.tensor([[random.randrange(3)]],
                                    dtype=torch.long).cuda()
            else:
                return torch.tensor([[random.randrange(3)]], dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < config.BATCH_SIZE:
            return
        transitions = self.memory.sample(config.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        state_batch = tuple([
            torch.cat(
                tuple([batch.state[i][j] for i in range(config.BATCH_SIZE)]))
            for j in range(3)
        ])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = tuple([
            torch.cat(
                tuple(
                    [batch.next_state[i][j]
                     for i in range(config.BATCH_SIZE)])) for j in range(3)
        ])
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        config.GAMMA) + reward_batch
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def load_model(self, encoder_path=None, decoder_path=None, DQN_path=None):
        if DQN_path is not None:
            self.policy_net.load_state_dict(
                torch.load(DQN_path,
                           map_location=lambda storage, loc: storage))
            self.target_net.load_state_dict(self.policy_net.state_dict())
        else:
            self.policy_net_encoder.load_state_dict(
                torch.load(encoder_path,
                           map_location=lambda storage, loc: storage))
            self.policy_net_decoder.load_state_dict(
                torch.load(decoder_path,
                           map_location=lambda storage, loc: storage))
            self.policy_net = DQN(self.policy_net_encoder,
                                  self.policy_net_decoder)
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def train(self, num_epochs, interval):
        env = Environment(np.array([0.5, 0.5]))
        episode = 0
        for epoch in range(num_epochs):
            env.reset()
            state = (env.x[env.current_step].unsqueeze(0),
                     env.y_seq[env.current_step].unsqueeze(0),
                     env.position.unsqueeze(0))
            while (1):
                action = self.select_action(state)
                _, next_state, reward = env.step(action.item())
                if next_state is None:
                    break
                self.memory.push(state, action, next_state, reward)
                state = next_state
                self.optimize_model()
                episode += 1
                if (episode % config.TARGET_UPDATE == 0):
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
                print(env.wealth, action, env.position)
            if (epoch + 1) % (interval) == 0 or epoch + 1 == num_epochs:
                torch.save(self.policy_net.state_dict(),
                           'models/DQN' + str(epoch + 1) + '.model')

    def test(self, num_epochs):
        env = Environment(test=True)
        state = (env.x[env.current_step], env.y_seq[env.current_step],
                 env.position)
        while (1):
            action = self.select_action(state, test=True)
            _, next_state, _ = env.step(action.item())
            if next_state is None:
                break
            state = next_state
            print(env.wealth)
Example #13
    state_var = torch.autograd.Variable(state)
    target_var = torch.autograd.Variable(target)
    target_var.unsqueeze_(0)


    import copy
    learner = DQN()
    actor = DQN()

    for param in actor.parameters():
        param.requires_grad = False

    cuda = False
    if torch.cuda.is_available():
        cuda = True
        learner = learner.cuda(0)
        reward = reward.cuda(0)
    optimizer = torch.optim.SGD(learner.parameters(), lr=0.01)
    criterion = L2_loss(0.999)

    learner.train()
    for k in range(100):
        if cuda:
            x = learner(d_state_var.cuda(0))
        else:
            x = learner(d_state_var)
        actor.load_state_dict(learner.state_dict())
        # print(x)
        for i in range(10):
            # state_var = torch.autograd.Variable(torch.randn(1, 3, 40, 40))
            y = actor(state_var)
Example #14
def main():
    global args, move_list, i_step
    args = parser.parse_args()


    #move_list = [x.__name__ for x in movement.__dict__.values()
    #    if inspect.isfunction(x)]
    #move_list.remove('focus')
    move_list=[]
    move_list.append('f_roll')
    move_list.append('idle')
    move_list.append('r_roll')
    move_list.append('l_roll')
    move_list.append('b_roll')
    move_list.append('light_atk')
    move_list.append('drink_estus')

    m = PyMouse(display=':0')
    k = PyKeyboard(display=':0')
    sct = mss(display=':0')

    env = DarkSoulsEnv(sct=sct, m=m, k=k)

    if not args.pretrain:
        args.pretrain = None

    model1 = DQN(action=len(move_list), variables=3, pretrained=args.pretrain)
    if use_cuda:
        model1.cuda()

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model1.parameters()])))

    optimizer = optim.Adam(model1.parameters(), lr=args.lr)
    #optimizer = optim.RMSprop(model.parameters())
    #optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)

    i_step = 0
    args.start_episode = 0
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_episode = checkpoint['episode']

            i_step = checkpoint['step']
            args.name = checkpoint['name']
            model1.load_state_dict(checkpoint['state_dict'])

            print("=> loaded checkpoint '{}' (epoch {})"
                      .format(args.resume, checkpoint['episode']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    model2 = copy.deepcopy(model1)

    train(model=model1, model2=model2, env=env, optimizer=optimizer)
Example #15
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape,
                                self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())

        self.optim = Adam(self.model.parameters(),
                          lr=self.config.learning_rate)

        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        s = s.cuda()
        a = a.cuda()
        r = r.cuda()
        s2 = s2.cuda()
        done = done.cuda()

        q_values = self.model(s).cuda()
        next_q_values = self.model(s2).cuda()
        next_q_state_values = self.target_model(s2).cuda()

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
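        # Double DQN: the online network selects argmax_a' Q(s2, a'), while the
        # target network supplies the value estimate for that action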
        next_q_value = next_q_state_values.gather(
            1,
            next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
model_path = 'dqn3.pth'

if __name__ == '__main__':
    # Create cartpole environment and network
    env = gym.make('CartPole-v0').unwrapped
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              memory_size=memory_size,
              lr=lr,
              epsilon=epsilon,
              epsilon_decay=epsilon_decay,
              update_iter=update_iter,
              batch_size=batch_size,
              gamma=gamma,
              model_path=model_path)
    net.cuda()
    net.load()
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()
            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)

            # Record the total reward
            total_reward += r

            # Revised the reward
class DQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_dim, self.config.action_dim)
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
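        # Vanilla DQN target: max over the online network's own estimates
        # (this agent keeps no separate target network)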
        next_q_value = next_q_values.max(1)[0]

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # Notice that detach the expected_q_value
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()


        return loss.item()

    def cuda(self):
        self.model.cuda()

    def load_weights(self, model_path):
        if model_path is None: return
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, output, tag=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")
class Agent():
    def __init__(self, n_actions, eps_start, eps_end, eps_steps, gamma, train,
                 cuda, batch_size):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_steps = eps_steps
        self.gamma = gamma
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.steps_done = 0

        self.policy_net = DQN(
            n_actions)  # CHANGE THESE TWO LINES FOR TESTING ON CART POLE
        self.target_net = DQN(
            n_actions)  # CHANGE THESE TWO LINES FOR TESTING ON CART POLE
        if not train:
            self.policy_net.load_state_dict(torch.load('NetParameters.txt'))
        self.update_target_net()
        if cuda:
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()

        self.criterion = nn.MSELoss()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        #self.optimizer=optim.Adam(self.policy_net.parameters(),0.001)

    def take_action(self, state):
        r = random.random()

        epsilon = self.eps_start - (
            (self.eps_start - self.eps_end) / self.eps_steps) * self.steps_done
        #epsilon=EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.steps_done / EPS_DECAY)

        self.steps_done += 1
        if epsilon < self.eps_end:
            epsilon = self.eps_end
        if r < epsilon:
            return random.randint(0, self.n_actions - 1)
        else:
            # the trailing [0] turns the size-1 LongTensor into a plain int,
            # which is what env.step() expects
            return self.policy_net(
                Variable(state.cuda(), volatile=True)).data.max(1)[1][0]

    def optimize_model(self, memory):
        if len(memory.memory) < self.batch_size:
            return
        transitions = memory.sample(self.batch_size)
        batch_state, batch_action, batch_next_state, batch_reward = zip(
            *transitions)
        batch_state = Variable(torch.cat(batch_state)).cuda()
        batch_action = Variable(torch.cat(batch_action)).cuda()
        batch_reward = Variable(torch.cat(batch_reward)).cuda()
        batch_next_state = Variable(torch.cat(batch_next_state)).cuda()

        current_q_values = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)
        )  # actions are 1-D, so unsqueeze them to index the batched Q-values

        max_next_q_values = self.target_net(batch_next_state).detach().max(
            1)[0]
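        # One-step TD target below; note there is no terminal mask here, so the
        # update bootstraps even on end-of-episode transitions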
        expected_q_values = batch_reward + (self.gamma * max_next_q_values)

        loss = self.criterion(current_q_values.squeeze(1), expected_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.steps_done % 400 == 0:  #update target net
            self.update_target_net()

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self):
        print("Cuva model")
        torch.save(self.policy_net.state_dict(), 'NetParameters.txt')
        print("Sacuvao ga je")