Example #1
def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory, env, batch_size)
    explorer = ExpExplorer(explore_start, explore_stop, decay_rate)
    dqn = DQN(state_shape=env.observation_space.shape,
              n_actions=env.action_space.n, lr=lr)
    dqn.model.summary()

    rewards_list = []
    loss = 1
    for episode in range(5000):
        episode_rewards = 0
        state = env.reset()
        done = False
        while not done:
            action, explore_probability = predict_action(dqn.model,
                                                         explorer,
                                                         state,
                                                         env.action_space.n)

            next_state, reward, done, _ = env.step(action)
            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state
            loss = learn(dqn.model, memory).history['loss']

            if done:
                rewards_list.append(episode_rewards)
                moving_average = np.mean(rewards_list[-100:])
                if episode % 10 == 0:
                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(episode_rewards),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {}'.format(loss),
                          'Moving average {}'.format(moving_average))

        if episode % 10 == 0:
            dqn.model.save(PATH)
Example #2
def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory, env, batch_size)
    explorer = ExpExplorer(explore_start, explore_stop, decay_rate)
    dqn = DQN(state_shape=env.observation_space.shape[0],
              n_actions=env.action_space.n).to(device)

    criterion = torch.nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(dqn.parameters(), lr=lr)

    rewards_list = []

    for episode in range(5000):
        episode_rewards = 0
        state = env.reset()
        done = False
        while not done:
            action, explore_probability = predict_action(
                dqn, explorer, state, env.action_space.n)

            next_state, reward, done, _ = env.step(action)
            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state
            loss = learn(dqn, memory, criterion, optimizer)

            if done:
                rewards_list.append(episode_rewards)
                moving_average = np.mean(rewards_list[-100:])
                if episode % 50 == 0:
                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(episode_rewards),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {}'.format(loss),
                          'Moving average {}'.format(moving_average))

        if episode % 100 == 0:
            torch.save(dqn.state_dict(), MODEL_PATH)
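
Neither `ExpExplorer` nor `predict_action` is shown in these examples. The sketch below is one plausible epsilon-greedy implementation with exponential decay, written for the PyTorch variant (Example #2) and matching the `explore_start`, `explore_stop`, `decay_rate` constructor arguments; the attribute names and the step-counting convention are assumptions, not the original code.

import random
import numpy as np
import torch


class ExpExplorer:
    """Hypothetical exponentially decaying epsilon schedule."""

    def __init__(self, explore_start, explore_stop, decay_rate):
        self.explore_start = explore_start
        self.explore_stop = explore_stop
        self.decay_rate = decay_rate
        self.steps = 0

    def epsilon(self):
        # decay epsilon from explore_start towards explore_stop
        return self.explore_stop + \
            (self.explore_start - self.explore_stop) * np.exp(-self.decay_rate * self.steps)


def predict_action(model, explorer, state, n_actions):
    eps = explorer.epsilon()
    explorer.steps += 1
    if random.random() < eps:
        # explore: pick a uniformly random action
        action = random.randrange(n_actions)
    else:
        # exploit: pick the action with the highest predicted Q-value
        with torch.no_grad():
            q_values = model(torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
        action = int(q_values.argmax(dim=1).item())
    return action, eps
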
Example #4
class DQN_Agent(Agent):
    def __init__(self, env, model, policy,
                 ## hyper-parameter
                 gamma=0.90, lr=1e-3, batch_size=32, buffer_size=50000, learning_starts=1000,
                 target_network_update_freq=1000,
                 ## decay
                 decay=False, decay_rate=0.9,
                 ## DDqn && DuelingDQN
                 double_dqn=True, dueling_dqn=False, dueling_way="native",
                 ## prioritized_replay
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6, param_noise=False,
                 ##
                 path=None):

        """

        :param env:      the GYM environment
        :param model:    the Torch NN model
        :param policy:   the policy when choosing action
        :param ep:       the MAX episode time
        :param step:     the MAx step time
         .........................hyper-parameter..................................
        :param gamma:
        :param lr:
        :param batchsize:
        :param buffer_size:
        :param target_network_update_freq:
        .........................further improve way..................................
        :param double_dqn:  whether enable DDQN
        :param dueling_dqn: whether dueling DDQN
        :param dueling_way: the Dueling DQN method
            it can choose the following three ways
            `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
            `max`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
            `naive`: Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        .........................prioritized-part..................................
        :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
        :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer.
        It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
        :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
        :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None equals to max_timesteps.
        :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
        .........................imitation_learning_part..................................
        :param imitation_learning_policy:     To initial the network with the given policy
        which is supervised way to training the network
        :param IL_time:    supervised training times
        :param network_kwargs:
        """

        self.env = env
        self.policy = policy

        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.target_network_update_freq = target_network_update_freq
        self.double_dqn = double_dqn

        if dueling_dqn:
            self.Q_net = Dueling_dqn(model, dueling_way)
        else:
            self.Q_net = model

        self.target_Q_net = deepcopy(self.Q_net)

        # keep the optimizer and the (optional) LR scheduler separate: the scheduler
        # only adjusts the learning rate and does not apply gradients itself
        self.optim = Adam(self.Q_net.parameters(), lr=lr)
        if decay:
            self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optim, decay_rate, last_epoch=-1)
        else:
            self.lr_scheduler = None

        self.replay_buffer = ReplayMemory(buffer_size)
        self.learning = False
        super(DQN_Agent, self).__init__(path)

    def forward(self, observation):
        # convert the observation to a float tensor and compute Q-values
        observation = torch.from_numpy(observation.astype(np.float32))
        Q_value = self.Q_net(observation).detach().numpy()
        # either delegate action selection to the configured policy or act greedily
        if self.policy is not None:
            action = self.policy.select_action(Q_value)
        else:
            action = np.argmax(Q_value)
        return action, Q_value

    def backward(self, sample_):
        self.replay_buffer.push(sample_)
        if self.step > self.learning_starts and self.learning:
            sample = self.replay_buffer.sample(self.batch_size)
            assert len(sample["s"]) == self.batch_size
            a = sample["a"].long().unsqueeze(1)
            # Q-values of the actions that were actually taken
            Q = self.Q_net(sample["s"]).gather(1, a)
            if self.double_dqn:
                # Double DQN: the online network selects the next action,
                # the target network evaluates it
                _, next_actions = self.Q_net(sample["s_"]).max(1, keepdim=True)
                targetQ = self.target_Q_net(sample["s_"]).gather(1, next_actions)
            else:
                # vanilla DQN: max returns (values, indices), keep the values
                targetQ, _ = self.target_Q_net(sample["s_"]).max(1, keepdim=True)
            targetQ = targetQ.squeeze(1)
            Q = Q.squeeze(1)
            # bootstrap only on non-terminal transitions; do not backpropagate into the target
            expected_q_values = sample["r"] + self.gamma * targetQ.detach() * (1.0 - sample["tr"])
            loss = torch.mean(huber_loss(expected_q_values - Q))
            self.optim.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.Q_net.parameters(), 1, norm_type=2)
            self.optim.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()
            if self.step % self.target_network_update_freq == 0:
                self.target_net_update()
            return loss.item()
        return 0

    def target_net_update(self):
        self.target_Q_net.load_state_dict(self.Q_net.state_dict())

    def load_weights(self, filepath):
        model = torch.load(filepath)
        self.Q_net.load_state_dict(model["Q_net"])
        self.target_Q_net.load_state_dict(model["target_Q_net"])
        self.optim.load_state_dict(model["optim"])

    def save_weights(self, filepath, overwrite=False):
        # save state dicts so that load_weights can restore them via load_state_dict
        torch.save({"Q_net": self.Q_net.state_dict(),
                    "target_Q_net": self.target_Q_net.state_dict(),
                    "optim": self.optim.state_dict()
                    }, filepath + "DQN.pkl")
Example #5
def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory)
    print('finished filling memory')
    explorer = LinearExplorer(1, 0.1, 1000000, 0.01, 24000000)
    dqn = DQN(state_shape=PROCESSED_FRAME_SIZE,
              n_actions=env.action_space.n).to(device)
    target_dqn = DQN(state_shape=PROCESSED_FRAME_SIZE,
                     n_actions=env.action_space.n).to(device)

    criterion = torch.nn.SmoothL1Loss().to(device)
    optimizer = torch.optim.Adam(dqn.parameters(), lr=lr)

    frame_stack = FrameStack(4, PROCESSED_FRAME_SIZE)
    rewards_list = []
    total_steps = 0

    ts_frame = 0
    ts = time.time()

    for episode in range(episodes_train):

        episode_rewards = 0
        losses = []
        state = env.reset()
        state = frame_stack.push_get(process_frame(state), True)
        done = False

        while not done:
            action, explore_probability = predict_action(dqn,
                                                         explorer,
                                                         state,
                                                         env.action_space.n,
                                                         total_steps)

            next_state, reward, done, _ = env.step(action)
            next_state = frame_stack.push_get(process_frame(next_state))

            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state

            if total_steps % update_frequency == 0:
                loss = learn(dqn, target_dqn, memory, criterion, optimizer)
                losses.append(loss.item())

            if done:
                speed = (total_steps - ts_frame) / (time.time() - ts)
                ts_frame = total_steps
                ts = time.time()

                rewards_list.append(episode_rewards)

                print('Episode: {}'.format(episode),
                      'Total reward: {}'.format(episode_rewards),
                      'Explore P: {:.4f}'.format(explore_probability),
                      'Training Loss {}'.format(np.mean(losses)),
                      'total steps {}'.format(total_steps),
                      'speed {} frames/sec'.format(speed))

            if total_steps % target_net_update_freq == 0:
                target_dqn.load_state_dict(dqn.state_dict())

            total_steps += 1

        if episode % 100 == 0:
            torch.save(dqn.state_dict(), MODEL_PATH)
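
`LinearExplorer` is not shown either. The call `LinearExplorer(1, 0.1, 1000000, 0.01, 24000000)` suggests a two-phase linear epsilon schedule (1 -> 0.1 over the first million steps, then 0.1 -> 0.01 until step 24,000,000); the sketch below is only a guess at that interface, and the parameter names are assumptions.

class LinearExplorer:
    """Hypothetical two-phase linear epsilon schedule."""

    def __init__(self, eps_start, eps_mid, phase1_steps, eps_final, total_steps):
        self.eps_start = eps_start
        self.eps_mid = eps_mid
        self.phase1_steps = phase1_steps
        self.eps_final = eps_final
        self.total_steps = total_steps

    def epsilon(self, step):
        if step < self.phase1_steps:
            # phase 1: linear decay from eps_start to eps_mid
            frac = step / self.phase1_steps
            return self.eps_start + frac * (self.eps_mid - self.eps_start)
        # phase 2: linear decay from eps_mid to eps_final, then hold at eps_final
        frac = min((step - self.phase1_steps) / (self.total_steps - self.phase1_steps), 1.0)
        return self.eps_mid + frac * (self.eps_final - self.eps_mid)
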
Example #6
def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory)
    print('finished filling memory')
    explorer = LinearExplorer(1, 0.1, 100000, 0.01, 1000000)
    dqn = DQN(state_shape=env.observation_space.shape[0],
              n_actions=env.action_space.n).to(device)
    target_dqn = DQN(state_shape=env.observation_space.shape[0],
                     n_actions=env.action_space.n).to(device)

    criterion = torch.nn.SmoothL1Loss().to(device)
    optimizer = torch.optim.Adam(dqn.parameters(), lr=lr)

    latest_rewards = deque([], maxlen=100)
    total_steps = 0

    ts_frame = 0
    ts = time.time()

    for episode in range(episodes_train):

        episode_rewards = 0
        losses = []
        state = env.reset()
        done = False

        while not done:
            action, explore_probability = predict_action(
                dqn, explorer, state, env.action_space.n, total_steps)

            next_state, reward, done, _ = env.step(action)

            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state

            if total_steps % update_frequency == 0:
                loss = learn(dqn, target_dqn, memory, criterion, optimizer)
                losses.append(loss.item())

            if done:
                speed = (total_steps - ts_frame) / (time.time() - ts)
                ts_frame = total_steps
                ts = time.time()

                latest_rewards.append(episode_rewards)

                print('Episode: {}'.format(episode),
                      'reward: {}'.format(episode_rewards),
                      'explore P: {:.4f}'.format(explore_probability),
                      'loss: {:.4f}'.format(np.mean(losses)),
                      'steps: {}'.format(total_steps),
                      'speed: {:.1f} frames/sec'.format(speed),
                      'average 100: {:.2f}'.format(np.mean(latest_rewards)))

            if total_steps % target_net_update_freq == 0:
                target_dqn.load_state_dict(dqn.state_dict())

            total_steps += 1

        if episode % 10 == 0:
            torch.save(dqn.state_dict(), MODEL_PATH)
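
The `learn` helper called in the last two examples (the variants with a separate target network) is also not part of the snippets. The sketch below shows one plausible implementation, assuming `memory.sample(batch_size)` returns lists of states, actions, rewards, next states and done flags; the batch format and the module-level constants are assumptions, not the original code.

import numpy as np
import torch

# assumed module-level settings, mirroring the globals used in the examples above
batch_size = 32
gamma = 0.99
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def learn(dqn, target_dqn, memory, criterion, optimizer):
    # sample a minibatch of transitions (assumed to be five parallel lists)
    states, actions, rewards, next_states, dones = memory.sample(batch_size)
    states = torch.as_tensor(np.array(states), dtype=torch.float32, device=device)
    actions = torch.as_tensor(actions, dtype=torch.int64, device=device).unsqueeze(1)
    rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32, device=device)
    dones = torch.as_tensor(dones, dtype=torch.float32, device=device)

    # Q(s, a) for the actions that were actually taken
    q_values = dqn(states).gather(1, actions).squeeze(1)

    # TD target from the frozen target network; no bootstrap on terminal transitions
    with torch.no_grad():
        next_q = target_dqn(next_states).max(1)[0]
        targets = rewards + gamma * next_q * (1.0 - dones)

    loss = criterion(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
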