Example #1
def main():
    # Create CartPole environment and network
    env = gym.make('CartPole-v0').unwrapped
    if not os.path.exists(model_path):
        raise Exception("You should train the DQN first!")
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              epsilon=epsilon,
              batch_size=batch_size,
              model_path=model_path)
    net.load()
    net.cuda()
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()

            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, _ = env.step(a)

            total_reward += r
            if finish:
                print("Episode: %d \t Total reward: %d \t Eps: %f" %
                      (i, total_reward, net.epsilon))
                reward_list.append(total_reward)
                break
            s = s_
    env.close()
    print("Testing average reward: ", np.mean(reward_list))
Example #2
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max,
                                      args.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space)
        if args.model and os.path.isfile(args.model):
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
        if args.cuda:
            self.online_net.cuda()
            self.target_net.cuda()
            self.support = self.support.cuda()

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[1][0]

    # Acts with an ε-greedy policy
    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(
            self.action_space) if random.random() < epsilon else self.act(
                state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)

        # Calculate current state probabilities
        self.online_net.reset_noise()  # Sample new noise for online network
        ps = self.online_net(states)  # Probabilities p(s_t, ·; θonline)
        ps_a = ps[range(self.batch_size), actions]  # p(s_t, a_t; θonline)

        # Calculate nth next state probabilities
        self.online_net.reset_noise()  # Sample new noise for action selection
        pns = self.online_net(
            next_states).data  # Probabilities p(s_t+n, ·; θonline)
        dns = self.support.expand_as(
            pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
        argmax_indices_ns = dns.sum(2).max(
            1
        )[1]  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
        self.target_net.reset_noise()  # Sample new target net noise
        pns = self.target_net(
            next_states).data  # Probabilities p(s_t+n, ·; θtarget)
        pns_a = pns[range(
            self.batch_size
        ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

        # Compute Tz (Bellman operator T applied to z)
        Tz = returns.unsqueeze(1) + nonterminals * (
            self.discount**self.n) * self.support.unsqueeze(
                0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
        Tz = Tz.clamp(min=self.Vmin,
                      max=self.Vmax)  # Clamp between supported values
        # Compute L2 projection of Tz onto fixed support z
        b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
        l, u = b.floor().long(), b.ceil().long()
        # Fix disappearing probability mass when l = b = u (b is int)
        l[(u > 0) * (l == u)] -= 1
        u[(l < (self.atoms - 1)) * (l == u)] += 1

        # Distribute probability of Tz
        m = states.data.new(self.batch_size, self.atoms).zero_()
        offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                self.batch_size).unsqueeze(1).expand(
                                    self.batch_size,
                                    self.atoms).type_as(actions)
        m.view(-1).index_add_(
            0, (l + offset).view(-1),
            (pns_a *
             (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
        m.view(-1).index_add_(
            0, (u + offset).view(-1),
            (pns_a *
             (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        ps_a = ps_a.clamp(min=1e-3)  # Clamp for numerical stability in log
        loss = -torch.sum(
            Variable(m) * ps_a.log(),
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        (weights * loss).mean().backward()  # Importance weight losses
        self.optimiser.step()

        mem.update_priorities(
            idxs, loss.data)  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def save(self, path):
        torch.save(self.online_net.state_dict(),
                   os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[0][0]

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
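The least obvious part of learn() above is the projection of the shifted support Tz back onto the fixed atoms with index_add_. Below is a self-contained toy run of that projection with assumed values (2 transitions, 5 atoms), independent of the class above:

import torch

# Toy setup: 2 transitions, 5 atoms on [-2, 2]
batch_size, atoms, Vmin, Vmax = 2, 5, -2.0, 2.0
support = torch.linspace(Vmin, Vmax, atoms)          # tensor([-2., -1., 0., 1., 2.])
delta_z = (Vmax - Vmin) / (atoms - 1)                # 1.0
pns_a = torch.full((batch_size, atoms), 0.2)         # uniform next-state distribution
returns = torch.tensor([0.5, -0.3])
nonterminals = torch.tensor([[1.0], [1.0]])
discount, n = 0.99, 3

Tz = returns.unsqueeze(1) + nonterminals * (discount ** n) * support.unsqueeze(0)
Tz = Tz.clamp(min=Vmin, max=Vmax)
b = (Tz - Vmin) / delta_z                            # fractional atom indices
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) & (l == u)] -= 1                           # avoid losing mass when b is integral
u[(l < (atoms - 1)) & (l == u)] += 1

m = torch.zeros(batch_size, atoms)
offset = (torch.arange(batch_size) * atoms).unsqueeze(1).expand(batch_size, atoms)
m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))
m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))
print(m.sum(1))                                      # each row still sums to 1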
Example #3
class Agent():
  def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.priority_exponent = args.priority_exponent
    self.max_gradient_norm = args.max_gradient_norm

    self.policy_net = DQN(args, self.action_space)
    if args.model and os.path.isfile(args.model):
      self.policy_net.load_state_dict(torch.load(args.model))
    self.policy_net.train()

    self.target_net = DQN(args, self.action_space)
    self.update_target_net()
    self.target_net.eval()

    self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
    if args.cuda:
      self.policy_net.cuda()
      self.target_net.cuda()
      self.support = self.support.cuda()

  # Resets noisy weights in all linear layers (of policy and target nets)
  def reset_noise(self):
    self.policy_net.reset_noise()
    self.target_net.reset_noise()

  # Acts based on single state (no batch)
  def act(self, state):
    return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0]

  def learn(self, mem):
    idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)
    batch_size = len(idxs)  # May return less than specified if invalid transitions sampled

    # Calculate current state probabilities
    ps = self.policy_net(states)  # Probabilities p(s_t, ·; θpolicy)
    ps_a = ps[range(batch_size), actions]  # p(s_t, a_t; θpolicy)

    # Calculate nth next state probabilities
    pns = self.policy_net(next_states).data  # Probabilities p(s_t+n, ·; θpolicy)
    dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θpolicy))
    argmax_indices_ns = dns.sum(2).max(1)[1]  # Perform argmax action selection using policy network: argmax_a[(z, p(s_t+n, a; θpolicy))]
    pns = self.target_net(next_states).data  # Probabilities p(s_t+n, ·; θtarget)
    pns_a = pns[range(batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θpolicy))]; θtarget)
    pns_a *= nonterminals  # Set p = 0 for terminal nth next states as all possible expected returns = expected reward at final transition

    # Compute Tz (Bellman operator T applied to z)
    Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
    Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
    # Compute L2 projection of Tz onto fixed support z
    b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
    l, u = b.floor().long(), b.ceil().long()

    # Distribute probability of Tz
    m = states.data.new(batch_size, self.atoms).zero_()
    offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).long().unsqueeze(1).expand(batch_size, self.atoms).type_as(actions)
    m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
    m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

    loss = -torch.sum(Variable(m) * ps_a.log(), 1)  # Cross-entropy loss (minimises Kullback-Leibler divergence)
    self.policy_net.zero_grad()
    (weights * loss).mean().backward()  # Importance weight losses
    nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.max_gradient_norm)  # Clip gradients (normalising by max value of gradient L2 norm)
    self.optimiser.step()

    mem.update_priorities(idxs, loss.data.abs().pow(self.priority_exponent))  # Update priorities of sampled transitions

  def update_target_net(self):
    self.target_net.load_state_dict(self.policy_net.state_dict())

  def save(self, path):
    torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth'))

  # Evaluates Q-value based on single state (no batch)
  def evaluate_q(self, state):
    return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0]

  def train(self):
    self.policy_net.train()

  def eval(self):
    self.policy_net.eval()
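Example #3 differs from Example #2 mainly in gradient clipping and in zeroing the next-state distribution for terminal transitions. The Double-Q step (select the action with the policy/online net, evaluate it with the target net) is the same in both; a small self-contained illustration with random distributions (all values assumed):

import torch

# Toy Double-DQN action selection over a categorical distribution:
# the policy net picks the action, the target net supplies its distribution.
batch_size, actions, atoms = 1, 3, 5
support = torch.linspace(-2.0, 2.0, atoms)
pns_policy = torch.softmax(torch.randn(batch_size, actions, atoms), dim=2)
pns_target = torch.softmax(torch.randn(batch_size, actions, atoms), dim=2)

expected_q = (pns_policy * support).sum(2)           # batch x actions
argmax_a = expected_q.argmax(1)                      # chosen by the policy net
pns_a = pns_target[range(batch_size), argmax_a]      # evaluated by the target net
print(argmax_a, pns_a.shape)                         # torch.Size([1, 5])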
Example #4
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor

Qt = DQN(in_channels=5, num_actions=18).type(dtype)
Qt_t = DQN(in_channels=5, num_actions=18).type(dtype)
Qt_t.load_state_dict(Qt.state_dict())
Qt_t.eval()
for param in Qt_t.parameters():
    param.requires_grad = False

if torch.cuda.device_count() > 0:
    Qt.cuda()
    Qt = nn.DataParallel(Qt).to(device0)
    Qt_t = nn.DataParallel(Qt_t).to(device0)
    batch_size = BATCH_SIZE * torch.cuda.device_count()
else:
    batch_size = BATCH_SIZE

# optimizer
optimizer = optim.RMSprop(Qt.parameters(),
                          lr=LEARNING_RATE,
                          alpha=ALPHA,
                          eps=EPS)

# training parameters
# Create environment
import gym
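A hedged continuation of Example #4 (not part of the original snippet): one way the Qt network and dtype alias above are typically used for epsilon-greedy action selection. The helper name and constants below are assumptions for illustration only.

import random

def select_action(state, eps_greedy):
    # state: a 1 x 5 x H x W frame stack already converted with .type(dtype)
    if random.random() < eps_greedy:
        return random.randrange(18)           # 18 actions, matching num_actions above
    with torch.no_grad():
        return int(Qt(state).argmax(dim=1).item())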
Example #5
def train():
    # global args
    # args = parser.parse_args()
    Learner = DQN().to(device)

    env = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1')
    # env = retro.make(game='Airstriker-Genesis', state='Level1')

    criterion = L2_loss(0.99).to(device)

    if is_cuda:
        Learner = Learner.cuda()
        criterion = criterion.cuda()

    optimizer = optim.SGD(Learner.parameters(), lr=0.01)

    eps_threshold = 0.8
    RM = ReplayMemory(1000)
    A_agent = ActorAgent(Learner, args)
    print("Start Episodes")
    for i_episode in range(50000):
        env.reset()
        A_agent.reset(Learner, args)
        last_state = get_screen(env)
        current_state = get_screen(env)
        state = current_state - last_state
        # state_var = torch.autograd.Variable(state)
        state_var = state.to(device)
        total_reward = 0
        if i_episode % 50 == 0:
            eps_threshold = 0.9
        for t in count():
            if t == 0:
                print("episode begin")
            eps_threshold -= 0.000019
            action_q = A_agent.act(state_var, eps_threshold)
            """
            if is_cuda:
                action_q = action_q.cpu()
                _, action = action_q.data.max(2)
            else:
                _, action = action_q.data.max(2)
            """
            _, action = action_q.data.max(2)

            action_numpy = action.squeeze(0).numpy()
            # print(list(action_numpy))
            for i in range(4):
                _, reward, done, _ = env.step(action_numpy)
                total_reward += reward
            last_state = current_state
            current_state = get_screen(env)
            state = current_state - last_state
            # state_var = torch.autograd.Variable(state)
            state_var = state.to(device)
            # Save the state after taking the action
            A_agent.add_to_buffer(reward, action_q, state_var)

            # Store the state in the ReplayMemory
            if len(A_agent.localbuffer) > 10:
                p, error = calc_priority_TDerror(Learner, criterion, A_agent,
                                                 10)

                RM.push(p, error)

            if done:
                break

            # Optimize Learner model
            # if t%100==0 and len(A_agent.localbuffer)>80 and len(RM)>=30:
        for i in range(4):
            error_batch = RM.priority_sample(30)

            optimizer.zero_grad()
            # error_batch.backward(retain_graph=True)
            error_batch.backward()
            # Clamp gradients before the single optimizer step
            for param in Learner.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()
            print("{0}\t{1}\tLoss:{2}\tTotal:{3}\tReward:{4}".format(
                i_episode,
                t,
                float(error_batch),
                total_reward,
                reward,
            ))
        RM.reset()
        # env.render()

        with open("total_reward.txt", "a") as f:
            f.write("{0}\t{1}".format(i_episode, total_reward))
            f.write("\n")
Example #6
class Agent:
    def __init__(self):
        self.mode = "train"
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        print(self.config)
        self.load_config()

        self.online_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.target_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.online_net.train()
        self.target_net.train()
        self.update_target_net()
        for param in self.target_net.parameters():
            param.requires_grad = False

        if self.use_cuda:
            self.online_net.cuda()
            self.target_net.cuda()

        self.naozi = ObservationPool(capacity=self.naozi_capacity)
        # optimizer
        self.optimizer = torch.optim.Adam(
            self.online_net.parameters(),
            lr=self.config['training']['optimizer']['learning_rate'])
        self.clip_grad_norm = self.config['training']['optimizer'][
            'clip_grad_norm']

    def load_config(self):
        # word vocab
        with open("vocabularies/word_vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        # char vocab
        with open("vocabularies/char_vocab.txt") as f:
            self.char_vocab = f.read().split("\n")
        self.char2id = {}
        for i, w in enumerate(self.char_vocab):
            self.char2id[w] = i

        self.EOS_id = self.word2id["</s>"]
        self.train_data_size = self.config['general']['train_data_size']
        self.question_type = self.config['general']['question_type']
        self.random_map = self.config['general']['random_map']
        self.testset_path = self.config['general']['testset_path']
        self.naozi_capacity = self.config['general']['naozi_capacity']
        self.eval_folder = pjoin(
            self.testset_path, self.question_type,
            ("random_map" if self.random_map else "fixed_map"))
        self.eval_data_path = pjoin(self.testset_path, "data.json")

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training'][
            'max_nb_steps_per_episode']
        self.max_episode = self.config['training']['max_episode']
        self.target_net_update_frequency = self.config['training'][
            'target_net_update_frequency']
        self.learn_start_from_this_episode = self.config['training'][
            'learn_start_from_this_episode']

        self.run_eval = self.config['evaluate']['run_eval']
        self.eval_batch_size = self.config['evaluate']['batch_size']
        self.eval_max_nb_steps_per_episode = self.config['evaluate'][
            'max_nb_steps_per_episode']

        # Set the random seed manually for reproducibility.
        self.random_seed = self.config['general']['random_seed']
        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print(
                    "WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml"
                )
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.random_seed)
                self.use_cuda = True
        else:
            self.use_cuda = False

        if self.question_type == "location":
            self.answer_type = "pointing"
        elif self.question_type in ["attribute", "existence"]:
            self.answer_type = "2 way"
        else:
            raise NotImplementedError

        self.save_checkpoint = self.config['checkpoint']['save_checkpoint']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.load_pretrained = self.config['checkpoint']['load_pretrained']
        self.load_from_tag = self.config['checkpoint']['load_from_tag']

        self.qa_loss_lambda = self.config['training']['qa_loss_lambda']
        self.interaction_loss_lambda = self.config['training'][
            'interaction_loss_lambda']

        # replay buffer and updates
        self.discount_gamma = self.config['replay']['discount_gamma']
        self.replay_batch_size = self.config['replay']['replay_batch_size']
        self.command_generation_replay_memory = command_generation_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=self.config['replay']
            ['replay_memory_priority_fraction'],
            discount_gamma=self.discount_gamma)
        self.qa_replay_memory = qa_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=0.0)
        self.update_per_k_game_steps = self.config['replay'][
            'update_per_k_game_steps']
        self.multi_step = self.config['replay']['multi_step']

        # distributional RL
        self.use_distributional = self.config['distributional']['enable']
        self.atoms = self.config['distributional']['atoms']
        self.v_min = self.config['distributional']['v_min']
        self.v_max = self.config['distributional']['v_max']
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atoms)  # Support (range) of z
        if self.use_cuda:
            self.support = self.support.cuda()
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)

        # dueling networks
        self.dueling_networks = self.config['dueling_networks']

        # double dqn
        self.double_dqn = self.config['double_dqn']

        # counting reward
        self.revisit_counting_lambda_anneal_episodes = self.config[
            'episodic_counting_bonus'][
                'revisit_counting_lambda_anneal_episodes']
        self.revisit_counting_lambda_anneal_from = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_from']
        self.revisit_counting_lambda_anneal_to = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_to']
        self.revisit_counting_lambda = self.revisit_counting_lambda_anneal_from

        # valid command bonus
        self.valid_command_bonus_lambda = self.config[
            'valid_command_bonus_lambda']

        # epsilon greedy
        self.epsilon_anneal_episodes = self.config['epsilon_greedy'][
            'epsilon_anneal_episodes']
        self.epsilon_anneal_from = self.config['epsilon_greedy'][
            'epsilon_anneal_from']
        self.epsilon_anneal_to = self.config['epsilon_greedy'][
            'epsilon_anneal_to']
        self.epsilon = self.epsilon_anneal_from
        self.noisy_net = self.config['epsilon_greedy']['noisy_net']
        if self.noisy_net:
            # disable epsilon greedy
            self.epsilon_anneal_episodes = -1
            self.epsilon = 0.0

        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.single_word_verbs = set(["inventory", "look", "wait"])
        self.two_word_verbs = set(["go"])

    def train(self):
        """
        Tell the agent that it's training phase.
        """
        self.mode = "train"
        self.online_net.train()

    def eval(self):
        """
        Tell the agent that it's evaluation phase.
        """
        self.mode = "eval"
        self.online_net.eval()

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def reset_noise(self):
        if self.noisy_net:
            # Resets noisy weights in all linear layers (of online net only)
            self.online_net.reset_noise()

    def zero_noise(self):
        if self.noisy_net:
            self.online_net.zero_noise()
            self.target_net.zero_noise()

    def load_pretrained_model(self, load_from):
        """
        Load pretrained checkpoint from file.

        Arguments:
            load_from: File name of the pretrained model checkpoint.
        """
        print("loading model from %s\n" % (load_from))
        try:
            if self.use_cuda:
                state_dict = torch.load(load_from)
            else:
                state_dict = torch.load(load_from, map_location='cpu')
            self.online_net.load_state_dict(state_dict)
        except:
            print("Failed to load checkpoint...")

    def save_model_to_path(self, save_to):
        torch.save(self.online_net.state_dict(), save_to)
        print("Saved checkpoint to %s..." % (save_to))

    def init(self, obs, infos):
        """
        Prepare the agent for the upcoming games.

        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        # reset agent, get vocabulary masks for verbs / adjectives / nouns
        batch_size = len(obs)
        self.reset_binarized_counter(batch_size)
        self.not_finished_yet = np.ones((batch_size, ), dtype="float32")
        self.prev_actions = [["" for _ in range(batch_size)]]
        self.prev_step_is_still_interacting = np.ones(
            (batch_size, ), dtype="float32"
        )  # 1s and starts to be 0 when previous action is "wait"
        self.naozi.reset(batch_size=batch_size)

    def get_agent_inputs(self, string_list):
        sentence_token_list = [item.split() for item in string_list]
        sentence_id_list = [
            _words_to_ids(tokens, self.word2id)
            for tokens in sentence_token_list
        ]
        input_sentence_char = list_of_token_list_to_char_input(
            sentence_token_list, self.char2id)
        input_sentence = pad_sequences(
            sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32')
        input_sentence = to_pt(input_sentence, self.use_cuda)
        input_sentence_char = to_pt(input_sentence_char, self.use_cuda)
        return input_sentence, input_sentence_char, sentence_id_list

    def get_game_info_at_certain_step(self, obs, infos):
        """
        Get all needed info from game engine for training.
        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        batch_size = len(obs)
        feedback_strings = [preproc(item, tokenizer=self.nlp) for item in obs]
        description_strings = [
            preproc(item, tokenizer=self.nlp) for item in infos["description"]
        ]
        observation_strings = [
            d + " <|> " + fb if fb != d else d + " <|> hello"
            for fb, d in zip(feedback_strings, description_strings)
        ]

        inventory_strings = [
            preproc(item, tokenizer=self.nlp) for item in infos["inventory"]
        ]
        local_word_list = [
            obs.split() + inv.split()
            for obs, inv in zip(observation_strings, inventory_strings)
        ]

        directions = ["east", "west", "north", "south"]
        if self.question_type in ["location", "existence"]:
            # the agent observes the env but does not change it
            possible_verbs = [["go", "inventory", "wait", "open", "examine"]
                              for _ in range(batch_size)]
        else:
            possible_verbs = [
                list(set(item) - set(["", "look"])) for item in infos["verbs"]
            ]

        possible_adjs, possible_nouns = [], []
        for i in range(batch_size):
            object_nouns = [
                item.split()[-1] for item in infos["object_nouns"][i]
            ]
            object_adjs = [
                w for item in infos["object_adjs"][i] for w in item.split()
            ]
            possible_nouns.append(
                list(set(object_nouns) & set(local_word_list[i]) - set([""])) +
                directions)
            possible_adjs.append(
                list(set(object_adjs) & set(local_word_list[i]) - set([""])) +
                ["</s>"])

        return observation_strings, [
            possible_verbs, possible_adjs, possible_nouns
        ]

    def get_state_strings(self, infos):
        description_strings = infos["description"]
        inventory_strings = infos["inventory"]
        observation_strings = [
            _d + _i for (_d, _i) in zip(description_strings, inventory_strings)
        ]
        return observation_strings

    def get_local_word_masks(self, possible_words):
        possible_verbs, possible_adjs, possible_nouns = possible_words
        batch_size = len(possible_verbs)

        verb_mask = np.zeros((batch_size, len(self.word_vocab)),
                             dtype="float32")
        noun_mask = np.zeros((batch_size, len(self.word_vocab)),
                             dtype="float32")
        adj_mask = np.zeros((batch_size, len(self.word_vocab)),
                            dtype="float32")
        for i in range(batch_size):
            for w in possible_verbs[i]:
                if w in self.word2id:
                    verb_mask[i][self.word2id[w]] = 1.0
            for w in possible_adjs[i]:
                if w in self.word2id:
                    adj_mask[i][self.word2id[w]] = 1.0
            for w in possible_nouns[i]:
                if w in self.word2id:
                    noun_mask[i][self.word2id[w]] = 1.0
        adj_mask[:, self.EOS_id] = 1.0

        return [verb_mask, adj_mask, noun_mask]

    def get_match_representations(self,
                                  input_observation,
                                  input_observation_char,
                                  input_quest,
                                  input_quest_char,
                                  use_model="online"):
        model = self.online_net if use_model == "online" else self.target_net
        description_representation_sequence, description_mask = model.representation_generator(
            input_observation, input_observation_char)
        quest_representation_sequence, quest_mask = model.representation_generator(
            input_quest, input_quest_char)

        match_representation_sequence = model.get_match_representations(
            description_representation_sequence, description_mask,
            quest_representation_sequence, quest_mask)
        match_representation_sequence = match_representation_sequence * description_mask.unsqueeze(
            -1)
        return match_representation_sequence

    def get_ranks(self,
                  input_observation,
                  input_observation_char,
                  input_quest,
                  input_quest_char,
                  word_masks,
                  use_model="online"):
        """
        Given input observation and question tensors, to get Q values of words.
        """
        model = self.online_net if use_model == "online" else self.target_net
        match_representation_sequence = self.get_match_representations(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            use_model=use_model)
        action_ranks = model.action_scorer(match_representation_sequence,
                                           word_masks)  # list of 3 tensors
        return action_ranks

    def choose_maxQ_command(self, action_ranks, word_mask=None):
        """
        Generate a command by maximum q values, for epsilon greedy.
        """
        if self.use_distributional:
            action_ranks = [
                (item * self.support).sum(2) for item in action_ranks
            ]  # list of batch x n_vocab
        action_indices = []
        for i in range(len(action_ranks)):
            ar = action_ranks[i]
            ar = ar - torch.min(
                ar, -1, keepdim=True
            )[0] + 1e-2  # minus the min value, so that all values are non-negative
            if word_mask is not None:
                assert word_mask[i].size() == ar.size(), (
                    word_mask[i].size(), ar.size())
                ar = ar * word_mask[i]
            action_indices.append(torch.argmax(ar, -1))  # batch
        return action_indices

    def choose_random_command(self,
                              batch_size,
                              action_space_size,
                              possible_words=None):
        """
        Generate a command randomly, for epsilon greedy.
        """
        action_indices = []
        for i in range(3):
            if possible_words is None:
                indices = np.random.choice(action_space_size, batch_size)
            else:
                indices = []
                for j in range(batch_size):
                    mask_ids = []
                    for w in possible_words[i][j]:
                        if w in self.word2id:
                            mask_ids.append(self.word2id[w])
                    indices.append(np.random.choice(mask_ids))
                indices = np.array(indices)
            action_indices.append(to_pt(indices, self.use_cuda))  # batch
        return action_indices

    def get_chosen_strings(self, chosen_indices):
        """
        Turns list of word indices into actual command strings.
        chosen_indices: Word indices chosen by model.
        """
        chosen_indices_np = [to_np(item) for item in chosen_indices]
        res_str = []
        batch_size = chosen_indices_np[0].shape[0]
        for i in range(batch_size):
            verb, adj, noun = chosen_indices_np[0][i], chosen_indices_np[1][
                i], chosen_indices_np[2][i]
            res_str.append(self.word_ids_to_commands(verb, adj, noun))
        return res_str

    def word_ids_to_commands(self, verb, adj, noun):
        """
        Turn the 3 indices into actual command strings.

        Arguments:
            verb: Index of the guessing verb in vocabulary
            adj: Index of the guessing adjective in vocabulary
            noun: Index of the guessing noun in vocabulary
        """
        # turns 3 indices into actual command strings
        if self.word_vocab[verb] in self.single_word_verbs:
            return self.word_vocab[verb]
        if self.word_vocab[verb] in self.two_word_verbs:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        if adj == self.EOS_id:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        else:
            return " ".join([
                self.word_vocab[verb], self.word_vocab[adj],
                self.word_vocab[noun]
            ])

    def act_random(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        with torch.no_grad():
            batch_size = len(obs)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)
            chosen_indices = word_indices_random
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act_greedy(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            batch_size = len(obs)
            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step (greedy, no epsilon exploration here)
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            chosen_indices = word_indices_maxq
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act(self,
            obs,
            infos,
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            possible_words,
            random=False):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            if self.mode == "eval":
                return self.act_greedy(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            if random:
                return self.act_random(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            batch_size = len(obs)

            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step; epsilon-greedy is applied, i.e.,
            # there is an epsilon chance of generating random commands
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)

            # random number for epsilon greedy
            rand_num = np.random.uniform(low=0.0,
                                         high=1.0,
                                         size=(batch_size, ))
            less_than_epsilon = (rand_num < self.epsilon).astype(
                "float32")  # batch
            greater_than_epsilon = 1.0 - less_than_epsilon
            less_than_epsilon = to_pt(less_than_epsilon,
                                      self.use_cuda,
                                      type='long')
            greater_than_epsilon = to_pt(greater_than_epsilon,
                                         self.use_cuda,
                                         type='long')
            chosen_indices = [
                less_than_epsilon * idx_random +
                greater_than_epsilon * idx_maxq
                for idx_random, idx_maxq in zip(word_indices_random,
                                                word_indices_maxq)
            ]
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def get_dqn_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.command_generation_replay_memory) < self.replay_batch_size:
            return None

        data = self.command_generation_replay_memory.get_batch(
            self.replay_batch_size, self.multi_step)
        if data is None:
            return None

        obs_list, quest_list, possible_words_list, chosen_indices, rewards, next_obs_list, next_possible_words_list, actual_n_list = data
        batch_size = len(actual_n_list)

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, _ = self.get_agent_inputs(
            obs_list)
        next_input_observation, next_input_observation_char, _ = self.get_agent_inputs(
            next_obs_list)

        possible_words, next_possible_words = [], []
        for i in range(3):
            possible_words.append([item[i] for item in possible_words_list])
            next_possible_words.append(
                [item[i] for item in next_possible_words_list])

        local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(possible_words)
        ]
        next_local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(next_possible_words)
        ]

        action_ranks = self.get_ranks(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            local_word_masks,
            use_model="online"
        )  # list of batch x vocab or list of batch x vocab x atoms
        # ps_a
        word_qvalues = [
            ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1)
            for w_rank, idx in zip(action_ranks, chosen_indices)
        ]  # list of batch or list of batch x atoms
        q_value = torch.mean(torch.stack(word_qvalues, -1),
                             -1)  # batch or batch x atoms
        # log_ps_a
        log_q_value = torch.log(q_value)  # batch or batch x atoms

        with torch.no_grad():
            if self.noisy_net:
                self.target_net.reset_noise()  # Sample new target net noise
            if self.double_dqn:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="online")
                # list of batch x vocab or list of batch x vocab x atoms
                # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                # pns # Probabilities p(s_t+n, ·; θtarget)
                next_action_ranks = self.get_ranks(
                    next_input_observation,
                    next_input_observation_char,
                    input_quest,
                    input_quest_char,
                    next_local_word_masks,
                    use_model="target"
                )  # batch x vocab or list of batch x vocab x atoms
                # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms
            else:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="target")
                # list of batch x vocab or list of batch x vocab x atoms
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms

            next_q_value = torch.mean(torch.stack(next_word_qvalues, -1),
                                      -1)  # batch or batch x atoms
            # Compute Tz (Bellman operator T applied to z)
            discount = to_pt((np.ones_like(actual_n_list) *
                              self.discount_gamma)**actual_n_list,
                             self.use_cuda,
                             type="float")
        if not self.use_distributional:
            rewards = rewards + next_q_value * discount  # batch
            loss = F.smooth_l1_loss(q_value, rewards)
            return loss

        with torch.no_grad():
            Tz = rewards.unsqueeze(
                -1) + discount.unsqueeze(-1) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.v_min,
                          max=self.v_max)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.v_min) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = torch.zeros(batch_size, self.atoms).float()
            if self.use_cuda:
                m = m.cuda()
            offset = torch.linspace(0, ((batch_size - 1) * self.atoms),
                                    batch_size).unsqueeze(1).expand(
                                        batch_size, self.atoms).long()
            if self.use_cuda:
                offset = offset.cuda()
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (next_q_value *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (next_q_value *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_q_value,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        loss = torch.mean(loss)
        return loss

    def update_interaction(self):
        # update neural model by replaying snapshots in replay memory
        interaction_loss = self.get_dqn_loss()
        if interaction_loss is None:
            return None
        loss = interaction_loss * self.interaction_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(interaction_loss))

    def answer_question(self,
                        input_observation,
                        input_observation_char,
                        observation_id_list,
                        input_quest,
                        input_quest_char,
                        use_model="online"):
        # first pad answerer_input, and get the mask
        model = self.online_net if use_model == "online" else self.target_net
        batch_size = len(observation_id_list)
        max_length = input_observation.size(1)
        mask = compute_mask(input_observation)  # batch x obs_len

        # noun mask for location question
        if self.question_type in ["location"]:
            location_mask = []
            for i in range(batch_size):
                m = [1 for item in observation_id_list[i]]
                location_mask.append(m)
            location_mask = pad_sequences(location_mask,
                                          maxlen=max_length,
                                          dtype="float32")
            location_mask = to_pt(location_mask,
                                  enable_cuda=self.use_cuda,
                                  type='float')
            assert mask.size() == location_mask.size()
            mask = mask * location_mask

        match_representation_sequence = self.get_match_representations(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            use_model=use_model)
        pred = model.answer_question(match_representation_sequence,
                                     mask)  # batch x vocab or batch x 2

        # attention sum:
        # sometimes a word appears multiple times in the observation,
        # so we need to merge its scores before doing further computations.
        # However, if the answer type is not pointing, we just use a pre-defined
        # mapping of 0/1 to their positions in the vocab
        if self.answer_type == "2 way":
            observation_id_list = []
            max_length = 2
            for i in range(batch_size):
                observation_id_list.append(
                    [self.word2id["0"], self.word2id["1"]])

        observation = to_pt(
            pad_sequences(observation_id_list,
                          maxlen=max_length).astype('int32'), self.use_cuda)
        vocab_distribution = np.zeros(
            (batch_size, len(self.word_vocab)))  # batch x vocab
        vocab_distribution = to_pt(vocab_distribution,
                                   self.use_cuda,
                                   type='float')
        vocab_distribution = vocab_distribution.scatter_add_(
            1, observation, pred)  # batch x vocab
        non_zero_words = []
        for i in range(batch_size):
            non_zero_words.append(list(set(observation_id_list[i])))
        vocab_mask = torch.ne(vocab_distribution, 0).float()
        return vocab_distribution, non_zero_words, vocab_mask

    def point_maxq_position(self, vocab_distribution, mask):
        """
        Generate a command by maximum q values, for epsilon greedy.

        Arguments:
            vocab_distribution: Q values for each word in the vocabulary.
            mask: vocab mask.
        """
        vocab_distribution = vocab_distribution - torch.min(
            vocab_distribution, -1, keepdim=True
        )[0] + 1e-2  # minus the min value, so that all values are non-negative
        vocab_distribution = vocab_distribution * mask  # batch x vocab
        indices = torch.argmax(vocab_distribution, -1)  # batch
        return indices

    def answer_question_act_greedy(self, input_observation,
                                   input_observation_char, observation_id_list,
                                   input_quest, input_quest_char):

        with torch.no_grad():
            vocab_distribution, _, vocab_mask = self.answer_question(
                input_observation,
                input_observation_char,
                observation_id_list,
                input_quest,
                input_quest_char,
                use_model="online")  # batch x time
            positions_maxq = self.point_maxq_position(vocab_distribution,
                                                      vocab_mask)
            return positions_maxq  # batch

    def get_qa_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.qa_replay_memory) < self.replay_batch_size:
            return None
        transitions = self.qa_replay_memory.sample(self.replay_batch_size)
        batch = qa_memory.qa_Transition(*zip(*transitions))

        observation_list = batch.observation_list
        quest_list = batch.quest_list
        answer_strings = batch.answer_strings
        answer_position = np.array(_words_to_ids(answer_strings, self.word2id))
        groundtruth = to_pt(answer_position, self.use_cuda)  # batch

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, observation_id_list = self.get_agent_inputs(
            observation_list)

        answer_distribution, _, _ = self.answer_question(
            input_observation,
            input_observation_char,
            observation_id_list,
            input_quest,
            input_quest_char,
            use_model="online")  # batch x vocab

        batch_loss = NegativeLogLoss(answer_distribution, groundtruth)  # batch
        return torch.mean(batch_loss)

    def update_qa(self):
        # update neural model by replaying snapshots in replay memory
        qa_loss = self.get_qa_loss()
        if qa_loss is None:
            return None
        loss = qa_loss * self.qa_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(qa_loss))

    def finish_of_episode(self, episode_no, batch_size):
        # Update target network
        if (
                episode_no + batch_size
        ) % self.target_net_update_frequency <= episode_no % self.target_net_update_frequency:
            self.update_target_net()
        # decay lambdas
        if episode_no < self.learn_start_from_this_episode:
            return
        if episode_no < self.epsilon_anneal_episodes + self.learn_start_from_this_episode:
            self.epsilon -= (self.epsilon_anneal_from - self.epsilon_anneal_to
                             ) / float(self.epsilon_anneal_episodes)
            self.epsilon = max(self.epsilon, 0.0)
        if episode_no < self.revisit_counting_lambda_anneal_episodes + self.learn_start_from_this_episode:
            self.revisit_counting_lambda -= (
                self.revisit_counting_lambda_anneal_from -
                self.revisit_counting_lambda_anneal_to) / float(
                    self.revisit_counting_lambda_anneal_episodes)
            self.revisit_counting_lambda = max(self.revisit_counting_lambda, 0.0)

    def reset_binarized_counter(self, batch_size):
        self.binarized_counter_dict = [{} for _ in range(batch_size)]

    def get_binarized_count(self, observation_strings, update=True):
        count_rewards = []
        batch_size = len(observation_strings)
        for i in range(batch_size):
            key = observation_strings[i]
            if key not in self.binarized_counter_dict[i]:
                self.binarized_counter_dict[i][key] = 0.0
            if update:
                self.binarized_counter_dict[i][key] += 1.0
            r = self.binarized_counter_dict[i][key]
            r = float(r == 1.0)
            count_rewards.append(r)
        return count_rewards
Example #7
class Actor:
    def __init__(self,
                 learner,
                 param_server,
                 actor_idx,
                 epsilon,
                 num_channels=3,
                 num_actions=19):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make(ENV_NAME)
        self.port_number = 12340 + actor_idx
        print("actor environment %d initialize successfully" % self.actor_idx)
        self.env.make_interactive(port=self.port_number, realtime=False)
        self.learner_state_dict = ray.get(learner.get_state_dict.remote())
        print("getting learner state dict finished...")
        # network initialization
        self.actor_network = DQN(num_channels, num_actions).cuda()
        self.actor_target_network = DQN(num_channels, num_actions).cuda()
        self.actor_network.load_state_dict(self.learner_state_dict)
        self.actor_target_network.load_state_dict(self.learner_state_dict)
        print("actor network %d initialize successfully" % self.actor_idx)

        self.param_server = param_server
        self.epi_counter = 0
        self.max_epi = 100
        self.n_step = 4
        self.update_period = 4
        self.gamma = 0.99

        # exploring info
        self.epsilon = epsilon
        self.endEpsilon = 0.01
        self.stepDrop = (self.epsilon - self.endEpsilon) / self.max_epi
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)

        self.writer = SummaryWriter(f'runs/apex/actor{self.actor_idx}')

        # 1. Copy the network parameters
        # 2. Explore the environment (reset, act)
        # 3. Store transitions in the local buffer
        # 4. Compute priorities
        # 5. Push to the global buffer
        # 6. Periodically update the networks

    def get_epi_counter(self):
        return self.epi_counter

    def update_params(self, learner):
        ray.get(self.param_server.pull_from_learner.remote(learner))
        policy_params, target_params = ray.get(
            self.param_server.push_to_actor.remote())
        self.actor_network.load_state_dict(policy_params)
        self.actor_target_network.load_state_dict(target_params)

    def append_sample(self,
                      memory,
                      state,
                      action,
                      reward,
                      next_state,
                      done,
                      n_rewards=None):
        # Calculating priority (TD error)
        target = self.actor_network(state).data
        old_val = target[0][action].cpu()
        target_val = self.actor_target_network(next_state).data.cpu()
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + 0.99 * torch.max(target_val)

        # TD-error magnitude used as the priority; move to CPU to match old_val
        error = abs(old_val - target[0][action].cpu())
        state_ = state.cpu()
        next_state_ = next_state.cpu()

        if isinstance(memory, Memory):
            if n_rewards is None:
                memory.add(error, [state_, action, reward, next_state_, done])
            else:
                memory.add(
                    error,
                    (state_, action, reward, next_state_, done, n_rewards))

        else:
            if n_rewards is None:
                memory.add.remote(error,
                                  [state_, action, reward, next_state_, done])
            else:
                memory.add.remote(
                    error,
                    (state_, action, reward, next_state_, done, n_rewards))

    def explore(self, learner, memory):
        for num_epi in range(self.max_epi):
            obs = self.env.reset()
            state = converter(ENV_NAME, obs).cuda()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            if (self.epsilon > self.endEpsilon):
                self.epsilon -= self.stepDrop

            # initialize local_buffer
            n_step = self.n_step
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
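            # Discount factors gamma^0 .. gamma^(n-1) for the truncated n-step return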
            gamma_list = [self.gamma**i for i in range(n_step)]

            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state, self.epsilon)
                action_index = a_out
                action = make_19action(self.env, action_index)
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(ENV_NAME, obs_prime).cuda()

                # put transition in local buffer
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
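                # Discounted sum of the buffered rewards: an n-step return
                # estimate anchored at the oldest transition in the window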
                n_rewards = sum([
                    gamma * reward
                    for gamma, reward in zip(gamma_list, n_step_reward_buffer)
                ])
                n_step_n_rewards_buffer.append(n_rewards)

                if (len(n_step_state_buffer) >= n_step):
                    # Compute Priorities
                    for i in range(n_step):
                        self.append_sample(memory, n_step_state_buffer[i],
                                           n_step_action_buffer[i],
                                           n_step_reward_buffer[i],
                                           n_step_next_state_buffer[i],
                                           n_step_done_buffer[i],
                                           n_step_n_rewards_buffer[i])
                        if (n_step_done_buffer[i]):
                            break
                state = state_prime
                self.actor_network.cuda()
                self.actor_target_network.cuda()

                if done:
                    print("%d episode is done" % num_epi)
                    print("total rewards : %d " % total_reward)
                    self.writer.add_scalar('Rewards/train', total_reward,
                                           num_epi)
                    self.epi_counter += 1
                    if (num_epi % self.update_period == 0):
                        self.update_params(learner)
                    break
Example #8
def train(args):
    model = DQN(game=args.game)
    if args.use_pretrained:
        pretrained_weight = torch.load(
            sorted(glob(os.path.join('ckpt', args.tag, '*.pth')))[-1])
        model.load_state_dict(pretrained_weight)
    else:
        os.makedirs(os.path.join('ckpt', args.tag), exist_ok=True)
        model.apply(init_weights)
    model = model.cuda()
    start = time.time()

    episode = 0
    iteration = 0
    epsilon = args.epsilon
    decayed = args.decayed

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # instantiate game
    game = Game(game=args.game)
    high_score = 0

    # initialize replay memory
    D = deque()

    elapsed_time = 0
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    score = game.reward
    terminal = game.game_over()

    image_data = game.get_torch_image().cuda()
    state = torch.cat(
        (image_data, image_data, image_data, image_data)).unsqueeze(0)

    start = time.time()

    while iteration < args.iteration:
        output = model(state)[0]
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)

        # epsilon greedy exploration
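        # eps decays linearly from args.epsilon down to args.decayed over args.iteration steps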
        eps = epsilon - iteration * (epsilon - decayed) / args.iteration
        random_action = random.random() <= eps

        # Pick action --> random or index of maximum q value
        action_index = [
            torch.randint(
                model.number_of_actions, torch.Size([]), dtype=torch.int)
            if random_action else torch.argmax(output)
        ][0]
        action[action_index] = 1

        elapsed_time = time.time() - start

        # get next state and reward
        reward = game.act(action_index)
        terminal = game.game_over()
        image_data_1 = game.get_torch_image().cuda()

        state_1 = torch.cat(
            (state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0).cuda()
        action = action.unsqueeze(0).cuda()
        reward = torch.from_numpy(np.array(
            [reward], dtype=np.float32)).unsqueeze(0).cuda()

        # save transition to replay memory
        D.append(
            (state.cpu(), action.cpu(), reward.cpu(), state_1.cpu(), terminal))

        # if replay memory is full, remove the oldest transition
        if len(D) > args.replayMemorySize:
            D.popleft()

        # sample random minibatch
        minibatch = random.sample(D, min(len(D), args.minibatchSize))

        state_batch = torch.cat(tuple(d[0] for d in minibatch)).cuda()
        action_batch = torch.cat(tuple(d[1] for d in minibatch)).cuda()
        reward_batch = torch.cat(tuple(d[2] for d in minibatch)).cuda()
        state_1_batch = torch.cat(tuple(d[3] for d in minibatch)).cuda()

        # get output for the next state
        output_1_batch = model(state_1_batch)

        y_batch = torch.cat(
            tuple(reward_batch[i] if minibatch[i][4] else reward_batch[i] +
                  args.gamma * torch.max(output_1_batch[i])
                  for i in range(len(minibatch))))
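        # y_batch holds the Bellman targets: r for terminal transitions,
        # otherwise r + gamma * max_a' Q(s', a')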

        # Q-values of the actions actually taken (this example has no separate target network)
        q_value = torch.sum(model(state_batch) * action_batch, dim=1)

        # LR warmup
        if iteration < 20000:
            for g in optimizer.param_groups:
                g['lr'] = args.lr * iteration / 20000

        optimizer.zero_grad()
        y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)

        loss.backward()
        optimizer.step()

        state = state_1
        iteration += 1
        score += game.reward

        args.writer.add_scalar('Train/lr', optimizer.param_groups[0]['lr'],
                               iteration)
        args.writer.add_scalar('Train/epsilon', eps, iteration)
        args.writer.add_scalar('Train/loss', loss, iteration)
        args.writer.add_scalar('Train/replay_memory', len(D), iteration)

        if terminal:
            score = score - game.reward_terminal
            args.writer.add_scalar('Episode/elapsed_time', elapsed_time,
                                   episode)
            args.writer.add_scalar('Episode/episode', episode, episode)
            args.writer.add_scalar('Episode/score', score, episode)
            game.reset_game()
            episode += 1
            start = time.time()
            print(
                'Episode {} (Iteration {}): Agent passed {} pipes! Time: {:.3f}'
                .format(episode, iteration, score, elapsed_time))
            if score > high_score:
                print('Weight Saved!')
                high_score = score
                torch.save(
                    model,
                    os.path.join(
                        'ckpt', args.tag,
                        'E{:07d}_S{:03d}.pth'.format(episode, int(score))))
            score = 0
    print("Saving final model")
    torch.save(
        model,
        os.path.join('ckpt', args.tag,
                     'E_{:07d}_S{:03d}.pth'.format(episode, int(high_score))))
Example #9
def main():
    parser = argparse.ArgumentParser(description='DQN Breakout Script')
    parser.add_argument('--use-cuda',
                        action='store_true',
                        default=False,
                        help='whether to use CUDA (default: False)')
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        metavar='M',
                        help='batch size (default: 128)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.999,
                        metavar='M',
                        help='gamma (default: 0.999)')
    parser.add_argument('--eps-start',
                        type=float,
                        default=0.9,
                        metavar='M',
                        help='eps start (default: 0.9)')
    parser.add_argument('--eps-end',
                        type=float,
                        default=0.05,
                        metavar='M',
                        help='eps end (default: 0.05)')
    parser.add_argument('--eps-decay',
                        type=int,
                        default=200,
                        metavar='M',
                        help='eps decay (default: 200)')
    parser.add_argument('--num-obs-in-state',
                        type=int,
                        default=4,
                        metavar='M',
                        help='num observations in state (default: 4)')
    parser.add_argument('--replay-memory-capacity',
                        type=int,
                        default=10000,
                        metavar='M',
                        help='replay memory capacity (default: 10000)')
    parser.add_argument('--num-episodes',
                        type=int,
                        default=10,
                        metavar='M',
                        help='num of episodes (default: 10)')
    parser.add_argument('--reset-period',
                        type=int,
                        default=5,
                        metavar='M',
                        help='period to reset target network (default: 5)')
    parser.add_argument('--atari-env',
                        type=str,
                        default='Breakout-v0',
                        metavar='M',
                        help='Atari environment to use (default: Breakout-v0)')
    args = parser.parse_args()

    env = gym.envs.make(args.atari_env)

    model = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)
    model_target = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)

    if args.use_cuda:
        model.cuda()
        model_target.cuda()

    optimizer = optim.RMSprop(model.parameters())
    memory = ReplayMemory(args.replay_memory_capacity)

    epsilons = np.linspace(args.eps_start, args.eps_end, args.eps_decay)
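    # epsilons is a linear schedule over the first eps_decay steps; afterwards eps is held at eps_end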
    step_idx = 1
    reset_idx = 1

    tfs = get_transforms()

    episode_reward = 0.
    episode_length = 0

    for i_episode in range(args.num_episodes):
        # Initialize the environment and state
        obs = env.reset()
        state_processor = StateProcessor(args.num_obs_in_state, tfs, obs)
        state = state_processor.get_state()

        while True:
            episode_length += 1
            if step_idx < args.eps_decay:
                eps = epsilons[step_idx]
            else:
                eps = args.eps_end

            action = select_action(model, state, env.action_space.n, eps,
                                   args.use_cuda)
            # print('%d %d' % (episode_length, action[0,0]))
            next_obs, reward, done, info = env.step(action[0, 0])
            episode_reward += reward
            reward = torch.Tensor([reward])
            if args.use_cuda:
                reward = reward.cuda()

            if not done:
                state_processor.push_obs(next_obs)
                next_state = state_processor.get_state()
            else:
                next_state = None  # None next_state marks done

            memory.push(state, action, next_state, reward)

            # optimize
            optimize_model(optimizer, memory, model, model_target,
                           args.batch_size, args.gamma, args.use_cuda)

            step_idx += 1
            reset_idx += 1
            if reset_idx == args.reset_period:
                reset_idx = 1
                model_target.load_state_dict(model.state_dict())

            if done:
                break

        print(episode_reward)
        print(episode_length)
        episode_reward = 0.
        episode_length = 0
Example #10
class QAgent(Agent):
    def __init__(self):
        self.fex = Extractor()
        self.net = DQN()
        try:
            self.net.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu')))
        except FileNotFoundError:
            print("No saved weights found, starting with new weights")
        self.net.eval()
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.net.parameters())
        self.memory = ReplayMemory()
        self.training = False

        self.s = None
        self.a = None
        self.score = None

    def registerInitialState(self, state):
        self.s = None
        self.a = None
        self.score = None

    def getAction(self, game_state):
        legal = game_state.getLegalPacmanActions()
        if Directions.STOP in legal: legal.remove(Directions.STOP)
        state = self.fex(game_state)
        if self.training:
            state = state.cuda()
        with torch.no_grad():
            scores = self.net(state)
        scores = list(zip(ACTIONS, scores))
        legal_scores = [p for p in scores if p[0] in legal]
        action = max(legal_scores, key = lambda p: p[1])[0]

        if self.training:
            if random.random() < EPSILON:
                action = random.choice(legal)
            if self.s is not None:
                reward = game_state.getScore() - self.score
                reward = process_reward(self.s, state, reward)
                next_legals = game_state.getLegalActions()
                if Directions.STOP in next_legals: next_legals.remove(Directions.STOP)
                next_legals = (ACTION_MAP[d] for d in next_legals)
                self.memory.push(self.s, self.a, reward, state, next_legals)
            self.s = state
            self.a = ACTION_MAP[action]
            self.score = game_state.getScore()
        return action

    def final(self, state):
        if self.training:
            # fixed terminal penalty (overrides the raw score difference)
            reward = -10
            self.memory.push(self.s, self.a, reward, None, [])


    def train(self):
        global EPSILON
        self.training = True
        self.net.cuda()
        runners, names = load_runners()

        for epoch in range(EPOCHS):
            for t in self.net.parameters():
                print(t.data)
            if epoch <= 4:
                EPSILON = [0.8, 0.5, 0.3, 0.1, 0.01][epoch]
            print('Epoch {} | EPSILON {}'.format(epoch, EPSILON))
            g_dict = {}

            for runner, name in zip(runners, names):
                games = []
                for game_idx in range(GAMES_PER_EPOCH):
                    game = runner.run_game(self)
                    games.append(game)
                    for _ in range(SAMPLES_PER_GAME):
                        self.training_iteration()

                avg = np.mean([game.state.getScore() for game in games])
                wins = sum([game.state.isWin() for game in games])
                #print(f'{name}: {avg:0.2f} | {wins}/{GAMES_PER_EPOCH}')
                print('{}: {} | {}/{}'.format(name,avg, wins, GAMES_PER_EPOCH))
            print()
            torch.save(self.net.state_dict(), 'model.pth')


    def training_iteration(self):
        # sample mini-batch
        sarsl = self.memory.sample()
        if sarsl is None:
            return
        else:
            states, actions, rewards, next_states, next_state_legals = sarsl

        # replace deaths (None) with zeros
        for i, s in enumerate(next_states):
            if s is None:
                next_states[i] = self.fex.empty()
        next_states = torch.stack(next_states) 
        # get max Q(s',a'); deaths get value 0
        with torch.no_grad():
            next_actions_values = self.net(next_states)
            best_actions_values = []
            for next_legals, action_vals in zip(next_state_legals, next_actions_values):
                legal_vals = [v for (idx,v) in enumerate(action_vals) if idx in next_legals]
                if legal_vals == []:
                    legal_vals = [0]
                best_actions_values.append(max(legal_vals))
            best_actions_values = torch.tensor(best_actions_values).cuda()
        
            # compute target values
            targets = rewards + GAMMA*best_actions_values

        # compute current action values
        actions = actions.reshape(len(actions), 1)
        self.net.train()
        action_values = self.net(states).gather(1, actions).reshape(len(actions))
        self.net.eval()
        
        # compute loss and backpropagate it
        loss = self.criterion(action_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def play(self, path):
        runner = LocalPacmanGameRunner(layout_path=path,
                                       random_ghosts=True,
                                       show_window=True,
                                       zoom_window=1.0,
                                       frame_time=0.1,
                                       timeout=-1000)
        game = runner.run_game(self)
Example #11
# For training
var_batch_phi = autograd.Variable(torch.Tensor(batch_size, 4, 84, 84)).cuda()
var_batch_a = autograd.Variable(torch.LongTensor(batch_size, 1),
                                requires_grad=False).cuda()
var_batch_r = autograd.Variable(torch.Tensor(batch_size, 1)).cuda()
var_batch_phi_next = autograd.Variable(torch.Tensor(batch_size, 4, 84,
                                                    84)).cuda()
var_batch_r_mask = autograd.Variable(torch.Tensor(batch_size, 1),
                                     requires_grad=False).cuda()

MP = MemoryReplay(memory_size, batch_size)
dqn = DQN()
target_dqn = DQN()
target_dqn.load_state_dict(dqn.state_dict())

dqn.cuda()
target_dqn.cuda()

optimz = optim.RMSprop(dqn.parameters(),
                       lr=0.0025,
                       alpha=0.9,
                       eps=1e-02,
                       momentum=0.0)

pong = Pong()

for i in range(memory_size):
    phi = pong.current_phi
    act_index = random.randrange(3)
    phi_next, r, done = pong.step(VALID_ACTION[act_index])
    pong.display()
Example #12
class Agent:
    def __init__(self, time_step, split, lr):
        self.dataset = Dataset(T=time_step,
                               split_ratio=split,
                               binary_file=config.BINARY_DATASET)
        self.policy_net_encoder = AttnEncoder(
            input_size=self.dataset.get_num_features(),
            hidden_size=config.ENCODER_HIDDEN_SIZE,
            time_step=time_step)
        self.policy_net_decoder = AttnDecoder(
            code_hidden_size=config.ENCODER_HIDDEN_SIZE,
            hidden_size=config.DECODER_HIDDEN_SIZE,
            time_step=time_step)
        self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
        self.target_net_encoder = AttnEncoder(
            input_size=self.dataset.get_num_features(),
            hidden_size=config.ENCODER_HIDDEN_SIZE,
            time_step=time_step)
        self.target_net_decoder = AttnDecoder(
            code_hidden_size=config.ENCODER_HIDDEN_SIZE,
            hidden_size=config.DECODER_HIDDEN_SIZE,
            time_step=time_step)
        self.target_net = DQN(self.target_net_encoder, self.target_net_decoder)
        if torch.cuda.is_available():
            self.policy_net_encoder = self.policy_net_encoder.cuda()
            self.policy_net_decoder = self.policy_net_decoder.cuda()
            self.target_net_encoder = self.target_net_encoder.cuda()
            self.target_net_decoder = self.target_net_decoder.cuda()
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()
        self.memory = ReplayMemory(config.MEMORY_CAPACITY)
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

    def select_action(self, state, test=False):
        global steps_done
        sample = random.random()
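        # Epsilon decays exponentially from EPS_START toward EPS_END with time
        # constant EPS_DECAY as steps_done grows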
        eps_threshold = config.EPS_END + (
            config.EPS_START - config.EPS_END) * math.exp(
                -1. * steps_done / config.EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold or test:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            if torch.cuda.is_available():
                return torch.tensor([[random.randrange(3)]],
                                    dtype=torch.long).cuda()
            else:
                return torch.tensor([[random.randrange(3)]], dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < config.BATCH_SIZE:
            return
        transitions = self.memory.sample(config.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        state_batch = tuple([
            torch.cat(
                tuple([batch.state[i][j] for i in range(config.BATCH_SIZE)]))
            for j in range(3)
        ])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = tuple([
            torch.cat(
                tuple(
                    [batch.next_state[i][j]
                     for i in range(config.BATCH_SIZE)])) for j in range(3)
        ])
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        config.GAMMA) + reward_batch
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def load_model(self, encoder_path=None, decoder_path=None, DQN_path=None):
        if DQN_path is not None:
            self.policy_net.load_state_dict(
                torch.load(DQN_path,
                           map_location=lambda storage, loc: storage))
            self.target_net.load_state_dict(self.policy_net.state_dict())
        else:
            self.policy_net_encoder.load_state_dict(
                torch.load(encoder_path,
                           map_location=lambda storage, loc: storage))
            self.policy_net_decoder.load_state_dict(
                torch.load(decoder_path,
                           map_location=lambda storage, loc: storage))
            self.policy_net = DQN(self.policy_net_encoder,
                                  self.policy_net_decoder)
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def train(self, num_epochs, interval):
        env = Environment(np.array([0.5, 0.5]))
        episode = 0
        for epoch in range(num_epochs):
            env.reset()
            state = (env.x[env.current_step].unsqueeze(0),
                     env.y_seq[env.current_step].unsqueeze(0),
                     env.position.unsqueeze(0))
            while (1):
                action = self.select_action(state)
                _, next_state, reward = env.step(action.item())
                if next_state is None:
                    break
                self.memory.push(state, action, next_state, reward)
                state = next_state
                self.optimize_model()
                episode += 1
                if (episode % config.TARGET_UPDATE == 0):
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
                print(env.wealth, action, env.position)
            if (epoch + 1) % (interval) == 0 or epoch + 1 == num_epochs:
                torch.save(self.policy_net.state_dict(),
                           'models/DQN' + str(epoch + 1) + '.model')

    def test(self, num_epochs):
        env = Environment(test=True)
        state = (env.x[env.current_step], env.y_seq[env.current_step],
                 env.position)
        while (1):
            action = self.select_action(state, test=True)
            _, next_state, _ = env.step(action.item())
            if next_state is None:
                break
            state = next_state
            print(env.wealth)
Example #13
    state_var = torch.autograd.Variable(state)
    target_var = torch.autograd.Variable(target)
    target_var.unsqueeze_(0)


    import copy
    learner = DQN()
    actor = DQN()

    for param in actor.parameters():
        param.requires_grad = False

    cuda = False
    if torch.cuda.is_available():
        cuda = True
        learner = learner.cuda(0)
        reward = reward.cuda(0)
    optimizer = torch.optim.SGD(learner.parameters(), lr=0.01)
    criterion = L2_loss(0.999)

    learner.train()
    for k in range(100):
        if cuda:
            x = learner(d_state_var.cuda(0))
        else:
            x = learner(d_state_var)
        actor.load_state_dict(learner.state_dict())
        # print(x)
        for i in range(10):
            # state_var = torch.autograd.Variable(torch.randn(1, 3, 40, 40))
            y = actor(state_var)
Example #14
def main():
    global args, move_list, i_step
    args = parser.parse_args()


    #move_list = [x.__name__ for x in movement.__dict__.values()
    #    if inspect.isfunction(x)]
    #move_list.remove('focus')
    move_list=[]
    move_list.append('f_roll')
    move_list.append('idle')
    move_list.append('r_roll')
    move_list.append('l_roll')
    move_list.append('b_roll')
    move_list.append('light_atk')
    move_list.append('drink_estus')

    m = PyMouse(display=':0')
    k = PyKeyboard(display=':0')
    sct = mss(display=':0')

    env = DarkSoulsEnv(sct=sct, m=m, k=k)

    if not args.pretrain:
        args.pretrain = None

    model1 = DQN(action=len(move_list), variables=3, pretrained=args.pretrain)
    if use_cuda:
        model1.cuda()

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model1.parameters()])))

    optimizer = optim.Adam(model1.parameters(), lr=args.lr)
    #optimizer = optim.RMSprop(model.parameters())
    #optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)

    i_step = 0
    args.start_episode = 0
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_episode = checkpoint['episode']

            i_step = checkpoint['step']
            args.name = checkpoint['name']
            model1.load_state_dict(checkpoint['state_dict'])

            print("=> loaded checkpoint '{}' (epoch {})"
                      .format(args.resume, checkpoint['episode']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    model2 = copy.deepcopy(model1)

    train(model=model1, model2=model2, env=env, optimizer=optimizer)
Example #15
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape,
                                self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())

        self.optim = Adam(self.model.parameters(),
                          lr=self.config.learning_rate)

        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        s = s.cuda()
        a = a.cuda()
        r = r.cuda()
        s2 = s2.cuda()
        done = done.cuda()

        q_values = self.model(s).cuda()
        next_q_values = self.model(s2).cuda()
        next_q_state_values = self.target_model(s2).cuda()

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
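        # Double DQN: the online network selects argmax_a' Q(s2, a'), while the
        # target network supplies the value estimate for that action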
        next_q_value = next_q_state_values.gather(
            1,
            next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
model_path = 'dqn3.pth'

if __name__ == '__main__':
    # Create cartpole environment and network
    env = gym.make('CartPole-v0').unwrapped
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              memory_size=memory_size,
              lr=lr,
              epsilon=epsilon,
              epsilon_decay=epsilon_decay,
              update_iter=update_iter,
              batch_size=batch_size,
              gamma=gamma,
              model_path=model_path)
    net.cuda()
    net.load()
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()
            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)

            # Record the total reward
            total_reward += r

            # Revised the reward
class DQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_dim, self.config.action_dim)
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
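        # Vanilla DQN target: max over the online network's own estimates
        # (this agent keeps no separate target network)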
        next_q_value = next_q_values.max(1)[0]

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # Notice that detach the expected_q_value
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()


        return loss.item()

    def cuda(self):
        self.model.cuda()

    def load_weights(self, model_path):
        if model_path is None: return
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, output, tag=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")
class Agent():
    def __init__(self, n_actions, eps_start, eps_end, eps_steps, gamma, train,
                 cuda, batch_size):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_steps = eps_steps
        self.gamma = gamma
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.steps_done = 0

        self.policy_net = DQN(
            n_actions)  # CHANGE THESE TWO LINES FOR TESTING ON CART POLE
        self.target_net = DQN(
            n_actions)  # CHANGE THESE TWO LINES FOR TESTING ON CART POLE
        if not train:
            self.policy_net.load_state_dict(torch.load('NetParameters.txt'))
        self.update_target_net()
        if cuda:
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()

        self.criterion = nn.MSELoss()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        #self.optimizer=optim.Adam(self.policy_net.parameters(),0.001)

    def take_action(self, state):
        r = random.random()

        epsilon = self.eps_start - (
            (self.eps_start - self.eps_end) / self.eps_steps) * self.steps_done
        #epsilon=EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.steps_done / EPS_DECAY)

        self.steps_done += 1
        if epsilon < self.eps_end:
            epsilon = self.eps_end
        if r < epsilon:
            return random.randint(0, self.n_actions - 1)
        else:
            # the trailing [0] turns the size-1 LongTensor into a plain int,
            # which is what env.step() expects
            return self.policy_net(
                Variable(state.cuda(), volatile=True)).data.max(1)[1][0]

    def optimize_model(self, memory):
        if len(memory.memory) < self.batch_size:
            return
        transitions = memory.sample(self.batch_size)
        batch_state, batch_action, batch_next_state, batch_reward = zip(
            *transitions)
        batch_state = Variable(torch.cat(batch_state)).cuda()
        batch_action = Variable(torch.cat(batch_action)).cuda()
        batch_reward = Variable(torch.cat(batch_reward)).cuda()
        batch_next_state = Variable(torch.cat(batch_next_state)).cuda()

        current_q_values = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)
        )  # actions are 1-D, so unsqueeze them to index the batched Q-values

        max_next_q_values = self.target_net(batch_next_state).detach().max(
            1)[0]
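        # One-step TD target below; note there is no terminal mask here, so the
        # update bootstraps even on end-of-episode transitions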
        expected_q_values = batch_reward + (self.gamma * max_next_q_values)

        loss = self.criterion(current_q_values.squeeze(1), expected_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.steps_done % 400 == 0:  #update target net
            self.update_target_net()

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self):
        print("Cuva model")
        torch.save(self.policy_net.state_dict(), 'NetParameters.txt')
        print("Sacuvao ga je")