def train_convolutional_part(self, env, n_frames, print_state_every=100):
        self.current_model.mode_enc_dec = True
        # Take a random action
        action = self.current_model.act(state=None, epsilon=1.)
        state = env.reset()
        states_buffer = ReplayBuffer(capacity=1000)
        losses = []
        for i in range(n_frames):
            next_state, reward, done, _ = env.step(action)
            states_buffer.push(state, action, reward, next_state, done)

            if i % 4 == 0:
                action = self.current_model.act(state=None, epsilon=1.)

            if done:
                print("Episode done during Encoder Decoder Training")
                state = env.reset()
            if len(states_buffer) > self.batch_size:
                # Train
                loss = self.compute_conv_loss(
                    states_buffer.state_sample(batch_size=self.batch_size))
                # Save the loss
                losses.append(loss.item())
            if i % print_state_every == 0 and len(losses) > 1:
                print("Training Encoder Decoder. Step:" + str(i) + "/" +
                      str(n_frames) + ". "
                      "Mean Loss: " +
                      str(np.round(np.mean(losses[-10:]), decimals=5)))
        for param in self.current_model.encoder.parameters():
            param.requires_grad = False
        self.current_model.mode_enc_dec = False
        self.update_target()
Example #2
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, chkpt_dir='tmp/dueling_ddqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = Network(self.lr, self.n_actions,
                              input_dims=self.input_dims,
                              name='lunar_lander_dueling_ddqn_q_eval',
                              chkpt_dir=self.chkpt_dir)

        self.q_next = Network(self.lr, self.n_actions,
                              input_dims=self.input_dims,
                              name='lunar_lander_dueling_ddqn_q_next',
                              chkpt_dir=self.chkpt_dir)
Example #3
def main():
    policy_net = DQN(U_num, num_actions).to(device)  # initialize the Q network
    policy_net.apply(init_weights)
    if pretrained:
        ckp = torch.load('/data2/jiangjigang/ckp/dqn.pth')
        policy_net.load_state_dict(
            {k.replace('module.', ''): v
             for k, v in ckp.items()})
    target_net = DQN(U_num, num_actions).to(device)  # initialize the target Q network
    target_net.load_state_dict(policy_net.state_dict())  # copy the Q network's parameters into the target network
    target_net.eval()
    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=learning_rate)  # Adam optimizer (can be swapped for another)
    buffer = ReplayBuffer(
        buffer_size
    )  # replay buffer: stores experience tuples that are later sampled at random to update the network parameters (see Buffer.py)
    criterion = torch.nn.MSELoss(reduction='sum')

    # training
    for i_episode in range(num_episodes):

        state0 = [user_loc, user_dis, node_loc, use_buff]  # build an initial state
        error = 0.0
        all_reward = 0
        for t in count():
            # select an action
            action = e_greedy_select_action(state0, policy_net)
            a = np.array([action.data.cpu().numpy()])
            #print("action selected by e_greedy is {}".format(action))
            # use the transition function to get the next state and whether that state is terminal
            state1, done, flag = transition_function(state0, action)
            # use the reward function to get the reward for this step
            reward, cost_migration = reward_function(state0, action, state1,
                                                     flag)
            all_reward = all_reward + reward
            # store the experience in the buffer
            buffer.add(state0, a, reward, state1, done)

            # exit an episode after MAX_T steps
            if t > MAX_T:
                break

            # start updating the network parameters only after the warm-up episodes, so the replay buffer already holds enough data for stable training
            if i_episode > 1:

                # sample a training batch from the buffer; the batch size is set by BATCH_SIZE
                batch = buffer.getBatch(BATCH_SIZE)

                policy_net, target_net, bellman_error = optimize_model(
                    batch, policy_net, target_net, optimizer_policy, criterion)
                error = error + bellman_error.data.cpu().numpy()
            # move to the next state
            state0 = state1
        ave_error = error / (t * 1.00)
        ave_reward = all_reward / (t * 1.00)
        print(ave_error, ave_reward)
    torch.save(policy_net.state_dict(), '/data2/jiangjigang/ckp/dqn.pth')
Example #4
    def __init__(self,
                 input_size,
                 num_actions,
                 gamma=DEFAULT_GAMMA,
                 buffer_size=DEFAULT_BUFFER_SIZE,
                 batch_size=DEFAULT_BATCH_SIZE,
                 load_from_path=None,
                 prepare_conv=False):
        """
        Holds the double network (current and target) and is in charge of training and managing it
        :param input_size:
        :param num_actions:
        :param buffer_size: int. Size of the replay buffer
        :param batch_size: int. Size of the Batch
        """
        # Instantiate both models
        net = Raimbow if len(input_size) == 3 else DQN
        self.current_model = net(input_size=input_size,
                                 num_actions=num_actions,
                                 prepare_decoder=prepare_conv)
        if load_from_path is not None:
            self.load_weights(path=load_from_path)
        self.target_model = net(input_size=input_size,
                                num_actions=num_actions,
                                prepare_decoder=prepare_conv)

        # Put them into the GPU if available
        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()

        # Initialize the Adam optimizer and the replay buffer
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                           self.current_model.parameters()),
                                    lr=0.00001)
        self.replay_buffer = ReplayBuffer(capacity=buffer_size)

        # Make both networks start with the same weights
        self.update_target()

        # Save the rest of parameters
        self.batch_size = batch_size
        self.gamma = gamma
        self.input_channels = input_size
Example #5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Example #6
    def __init__(self, env, config):
        """Initialize an Agent object.
        
        Params
        ======
            env : environment to be handled
            config : configuration given a variety of parameters
        """

        self.env = env
        self.config = config
        # self.seed = (config['seed'])

        # set parameter for ML
        self.set_parameters(config)
        # Replay memory
        self.memory = ReplayBuffer(config)
        # Q-Network
        self.create_agents(config)
        # load agent
        if self.load_model:
            self.load_agent('trained_tennis_2k86.pth')
Example #7
def train_DQN(env: WrapIt, Q: DQN, Q_target: DQN, optimizer: namedtuple,
              replay_buffer: ReplayBuffer, exploration: Schedule):
    """
    @parameters
        Q:
        Q_target:
        optimizer: torch.nn.optim.Optimizer with parameters
        buffer: store the frame
    @return
        None
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    optimizer = optimizer.constructor(Q.parameters(), **optimizer.kwargs)

    num_actions = env.action_space.n
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    LOG_EVERY_N_STEPS = 10000
    last_obs = env.reset(passit=True)

    # Q.getSummary()

    out_count = 0
    bar = tqdm(range(ARGS.timesteps))
    for t in bar:
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        if t > ARGS.startepoch:
            value = select_epsilon_greedy_action(Q, recent_observations,
                                                 exploration, t, num_actions)
            action = value[0, 0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            obs = env.reset()
        last_obs = obs
        # bar.set_description(f"{obs.shape} {obs.dtype}")

        if (t > ARGS.startepoch and t % ARGS.dqn_freq == 0
                and replay_buffer.can_sample(ARGS.batchsize)):
            bar.set_description("backward")
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             done_mask) = replay_buffer.sample(ARGS.batchsize)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TENSOR(obs_batch, act_batch, rew_batch,
                                     next_obs_batch, 1 - done_mask)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TO(obs_batch, act_batch, rew_batch,
                                 next_obs_batch, not_done_mask)

            values = Q(obs_batch)
            current_Q_values = values.gather(
                1,
                act_batch.unsqueeze(1).long()).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to be propagated
            next_max_q = Q_target(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            Q_target_values = rew_batch + (ARGS.gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = Q_target_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 gives the right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            if num_param_updates % ARGS.dqn_updatefreq == 0:
                bar.set_description("update")
                Q_target.load_state_dict(Q.state_dict())
Example #8
import gym
import world
import utils
from Buffer import ReplayBuffer
from models import DQN
from world import Print, ARGS
from wrapper import WrapIt
from procedure import train_DQN

# ------------------------------------------------
env = gym.make('RiverraidNoFrameskip-v4')
env = WrapIt(env)
Print('ENV action', env.unwrapped.get_action_meanings())
Print('ENV observation', f"Image: {ARGS.imgDIM} X {ARGS.imgDIM} X {1}"
      )  # we assume grayscale images
# ------------------------------------------------
Optimizer = utils.getOptimizer()
schedule = utils.LinearSchedule(1000000, 0.1)

Game_buffer = ReplayBuffer(ARGS.buffersize, ARGS.framelen)

Q = utils.init_model(env, DQN).train().to(world.DEVICE)
Q_target = utils.init_model(env, DQN).eval().to(world.DEVICE)
# ------------------------------------------------
train_DQN(env,
          Q=Q,
          Q_target=Q_target,
          optimizer=Optimizer,
          replay_buffer=Game_buffer,
          exploration=schedule)
Example #9
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        #self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward

        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
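
# A minimal driver loop for the DDPG agent above, kept as a sketch: the
# `task` object (exposing state_size, action_size, action_low, action_high,
# reset() and a step(action) -> (next_state, reward, done) method) and the
# episode count are assumptions; only the DDPG methods defined above are used.
agent = DDPG(task)
for i_episode in range(1000):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print('Episode {:4d} | reward: {:7.3f} | best: {:7.3f}'.format(
        i_episode, agent.total_reward, agent.best_total_reward))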
Example #10
class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, chkpt_dir='tmp/dueling_ddqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = Network(self.lr, self.n_actions,
                              input_dims=self.input_dims,
                              name='lunar_lander_dueling_ddqn_q_eval',
                              chkpt_dir=self.chkpt_dir)

        self.q_next = Network(self.lr, self.n_actions,
                              input_dims=self.input_dims,
                              name='lunar_lander_dueling_ddqn_q_next',
                              chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation], dtype=torch.float).to(self.q_eval.device)
            _, advantage = self.q_eval.forward(state)
            action = torch.argmax(advantage).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                        if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.q_eval.device)
        rewards = torch.tensor(reward).to(self.q_eval.device)
        dones = torch.tensor(done).to(self.q_eval.device)
        actions = torch.tensor(action).to(self.q_eval.device)
        states_ = torch.tensor(new_state).to(self.q_eval.device)

        indices = np.arange(self.batch_size)

        V_s, A_s = self.q_eval.forward(states)
        V_s_, A_s_ = self.q_next.forward(states_)

        V_s_eval, A_s_eval = self.q_eval.forward(states_)

        q_pred = torch.add(V_s,
                        (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions]
        q_next = torch.add(V_s_,
                        (A_s_ - A_s_.mean(dim=1, keepdim=True)))

        q_eval = torch.add(V_s_eval, (A_s_eval - A_s_eval.mean(dim=1,keepdim=True)))

        max_actions = torch.argmax(q_eval, dim=1)

        q_next[dones] = 0.0
        q_target = rewards + self.gamma*q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
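
# A minimal sketch of a gym-style training loop for the dueling DDQN Agent
# above. The environment name and the hyperparameter values are assumptions;
# only the Agent methods defined above are used.
import gym

env = gym.make('LunarLander-v2')
agent = Agent(gamma=0.99, epsilon=1.0, lr=5e-4,
              n_actions=env.action_space.n,
              input_dims=env.observation_space.shape,
              mem_size=100000, batch_size=64)
for i in range(500):
    observation = env.reset()
    done, score = False, 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
        score += reward
    print('episode {} score {:.1f} epsilon {:.2f}'.format(i, score, agent.epsilon))
agent.save_models()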
Example #11
class Network:
    def __init__(self,
                 input_size,
                 num_actions,
                 gamma=DEFAULT_GAMMA,
                 buffer_size=DEFAULT_BUFFER_SIZE,
                 batch_size=DEFAULT_BATCH_SIZE,
                 load_from_path=None,
                 prepare_conv=False):
        """
        Holds the double network (current and target) and is in charge of training and managing it
        :param input_size:
        :param num_actions:
        :param buffer_size: int. Size of the replay buffer
        :param batch_size: int. Size of the Batch
        """
        # Instantiate both models
        net = Raimbow if len(input_size) == 3 else DQN
        self.current_model = net(input_size=input_size,
                                 num_actions=num_actions,
                                 prepare_decoder=prepare_conv)
        if load_from_path is not None:
            self.load_weights(path=load_from_path)
        self.target_model = net(input_size=input_size,
                                num_actions=num_actions,
                                prepare_decoder=prepare_conv)

        # Put them into the GPU if available
        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()

        # Initialize the Adam optimizer and the replay buffer
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                           self.current_model.parameters()),
                                    lr=0.00001)
        self.replay_buffer = ReplayBuffer(capacity=buffer_size)

        # Make both networks start with the same weights
        self.update_target()

        # Save the rest of parameters
        self.batch_size = batch_size
        self.gamma = gamma
        self.input_channels = input_size

    def get_action(self, state):
        return self.current_model.act(state, epsilon=0.)

    def update_target(self):
        """
        Updates the target model with the weights of the current model
        """
        self.target_model.load_state_dict(self.current_model.state_dict())

    def compute_td_loss(self, samples):
        """
        Compute the loss of batch size samples of the buffer, and train the current model network with that loss
        :param samples: tuple of samples. Samples must have the format (state, action, reward, next_state, done)
        :return:
        float. Loss computed at this learning step
        """
        # Take N playing samples
        state, action, reward, next_state, done = samples

        # Transform them into torch variables, for being used on GPU during the training
        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        # Get the q value of this state and all the q values of the following step
        q_value = self.current_model(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        next_q_values = self.current_model(next_state)
        # Get the q values of the following step following the static policy of the target model
        next_q_state_values = self.target_model(next_state)
        # For all the q_values of the next state get the one of the action which would be selected by the static policy
        next_q_value = next_q_state_values.gather(
            1,
            torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

        # Calculate the expected q value as the immediate reward plus gamma times the expected reward at t+1 (if the episode has not ended)
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # Calculate the Huber (smooth L1) loss
        loss = nn.functional.smooth_l1_loss(q_value,
                                            Variable(expected_q_value.data))

        # Backpropagates the loss
        self.optimizer.zero_grad()
        loss.backward()
        # Learn
        self.optimizer.step()

        # Return the loss of this step
        return loss

    def compute_conv_loss(self, frames):
        """
        Compute the reconstruction loss for a batch of frames and train the encoder-decoder part of the current model with that loss
        :param frames: batch of states (frames) sampled from the buffer
        :return:
        float. Loss computed at this learning step
        """

        # Transform them into torch variables, for being used on GPU during the training
        state = Variable(torch.FloatTensor(frames), requires_grad=True)

        loss = (state - self.current_model.forward(state)).pow(2).mean()

        # Backpropagates the loss
        self.optimizer.zero_grad()
        loss.backward()
        # Learn
        self.optimizer.step()

        # Return the loss of this step
        return loss

    def train_convolutional_part(self, env, n_frames, print_state_every=100):
        self.current_model.mode_enc_dec = True
        # Take a random action
        action = self.current_model.act(state=None, epsilon=1.)
        state = env.reset()
        states_buffer = ReplayBuffer(capacity=1000)
        losses = []
        for i in range(n_frames):
            next_state, reward, done, _ = env.step(action)
            states_buffer.push(state, action, reward, next_state, done)

            if i % 4 == 0:
                action = self.current_model.act(state=None, epsilon=1.)

            if done:
                print("Episode done during Encoder Decoder Training")
                state = env.reset()
            if len(states_buffer) > self.batch_size:
                # Train
                loss = self.compute_conv_loss(
                    states_buffer.state_sample(batch_size=self.batch_size))
                # Save the loss
                losses.append(loss.item())
            if i % print_state_every == 0 and len(losses) > 1:
                print("Training Encoder Decoder. Step:" + str(i) + "/" +
                      str(n_frames) + ". "
                      "Mean Loss: " +
                      str(np.round(np.mean(losses[-10:]), decimals=5)))
        for param in self.current_model.encoder.parameters():
            param.requires_grad = False
        self.current_model.mode_enc_dec = False
        self.update_target()

    def epsilon_by_frame(self,
                         frame_idx,
                         epsilon_start=EPSILON_START,
                         epsilon_final=EPSILON_FINAL,
                         epsilon_decay=EPSILON_DECAY):
        """
        Gets the epsilon of the current frame for the given parameters
        :param frame_idx: int. Index of the frame
        :param epsilon_start: float. Epsilon at frame 1
        :param epsilon_final: float. Minimum epsilon for maintaining exploration
        :param epsilon_decay: int. Manages how fast the epsilon decays
        :return:
        Epsilon for the frame frame_idx
        """
        return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
            -1. * frame_idx / epsilon_decay)

    def train(self,
              env,
              num_frames=DEFAULT_NUM_FRAMES,
              DQN_update_ratio=DEFAULT_DQN_UPDATE_RATIO,
              plotting_path=None,
              verbose=True,
              videos_to_save=DEFAULT_VIDEOS_TO_SAVE,
              train_conv_first=True,
              show=False):
        """
        Train the network in the given environment for an amount of frames
        :param env:
        :param num_frames:
        :return:
        """
        if train_conv_first:
            self.train_convolutional_part(env=env,
                                          n_frames=CONV_TRAINING_FRAMES)
        # Save the losses of the network and the rewards of each episode
        losses, all_rewards = [], []
        episode_reward = 0

        # Reset the game for starting the game from 0
        state = env.reset()
        actions_taken = []
        for i in range(MIN_RANDOM_ACTIONS):
            action = self.current_model.act(state, epsilon=1.)
            next_state, reward, done, _ = env.step(action)
            self.replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
            if done:
                env.reset()
        for frame_idx in range(1, num_frames + 1):
            # Gets an action for the current state having in account the current epsilon
            action = self.current_model.act(
                state, epsilon=self.epsilon_by_frame(frame_idx=frame_idx))
            actions_taken.append(action)
            if show:
                env.render()
            # Execute the action, capturing the new state, the reward and if the game is ended or not
            next_state, reward, done, _ = env.step(action)
            # Save the action at the replay buffer
            self.replay_buffer.push(state, action, reward, next_state, done)
            # Update the current state and the actual episode reward
            state = next_state
            episode_reward += reward

            # If a game is finished save the results of that game and restart the game
            if done:
                print("Episode Reward: " + str(episode_reward) + ".  "
                      "Std of actions: " +
                      str(np.round(np.std(actions_taken), decimals=4)) + ". "
                      "Epsilon " + str(
                          np.round(self.epsilon_by_frame(frame_idx=frame_idx),
                                   decimals=3)))
                actions_taken = []
                all_rewards.append(episode_reward)
                state, episode_reward = env.reset(), 0

            # If there are enough actions in the buffer for learning, start to learn a policy
            if frame_idx % ACTIONS_PER_TRAIN_STEP == 0:
                # Train
                loss = self.compute_td_loss(
                    self.replay_buffer.sample(self.batch_size))
                # Save the loss
                losses.append(loss.item())

            if plotting_path is not None and frame_idx % PLOT_EVERY == 0:
                save_plot(frame_idx,
                          all_rewards,
                          losses,
                          path_to_save=plotting_path)

            if frame_idx % DQN_update_ratio == 0:
                self.update_target()

                if verbose and frame_idx % DQN_update_ratio == 0:
                    print(
                        env.unwrapped.spec.id + ' Training: ' +
                        str(frame_idx) + '/' + str(num_frames) + '. '
                        'Mean Rewards: ' +
                        str(np.round(np.mean(all_rewards[-10:]), decimals=2)))

            if frame_idx % (num_frames // videos_to_save) == 0:
                save_video(env=env,
                           policy=self,
                           path=os.path.join(plotting_path, VIDEOS_DIR_NAME,
                                             'During Training',
                                             str(len(all_rewards)) + ' Games'))
                env.reset()

    def save(self, path):
        if not os.path.isdir(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.current_model.cpu()
        torch.save(self.current_model.state_dict(), path)
        if USE_CUDA:
            self.current_model.cuda()

    def load_weights(self, path):
        if not os.path.isfile(path):
            warnings.warn("Trying to charge non existent weights. Skipping")
        else:
            self.current_model.cpu()
            output_state_dict = torch.load(path)
            new_dict = {
                key:
                (output_state_dict[key] if key in output_state_dict else value)
                for key, value in self.current_model.state_dict().items()
            }
            self.current_model.load_state_dict(new_dict)

            for param in self.current_model.parameters():
                param.requires_grad = False

            for param in self.current_model.pre_output.parameters():
                param.requires_grad = True
            for param in self.current_model.output.parameters():
                param.requires_grad = True

            if USE_CUDA:
                self.current_model.cuda()
        return self.current_model
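
# A minimal usage sketch for the Network wrapper above. The environment
# constructor `make_env` is a hypothetical helper that returns a gym-style
# Atari environment with a (C, H, W) observation space; the paths and frame
# budget are also assumptions. Only the methods defined above are called.
env = make_env('PongNoFrameskip-v4')
network = Network(input_size=env.observation_space.shape,
                  num_actions=env.action_space.n,
                  prepare_conv=True)
network.train(env,
              num_frames=1000000,
              plotting_path='plots/pong',
              train_conv_first=True)
network.save('weights/pong_dqn.pth')

# Later: reload the weights and act greedily with the trained model.
network = Network(input_size=env.observation_space.shape,
                  num_actions=env.action_space.n,
                  load_from_path='weights/pong_dqn.pth')
action = network.get_action(env.reset())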
Example #12
user_loc = np.random.randint(0, 101, U_num).tolist()  # user locations, cells 1-100
user_dis = random_displacement(user_loc)  # future user displacement (up/down/left/right: -10, 10, -1, 1)
use_buff = np.random.randint(3, 8, U_num).tolist()  # resources required per user
state0 = [user_loc, user_dis, node_loc, use_buff]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# main program

policy_net = DQN(U_num, num_actions).to(device)  # initialize the Q network
target_net = DQN(U_num, num_actions).to(device)  # initialize the target Q network
target_net.load_state_dict(policy_net.state_dict())  # copy the Q network's parameters into the target network
target_net.eval()
optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=learning_rate)  # Adam optimizer (can be swapped for another)
buffer = ReplayBuffer(
    buffer_size)  # replay buffer: stores experience tuples that are later sampled at random to update the network parameters (see Buffer.py)
criterion = torch.nn.MSELoss(reduction='sum')

# training
for i_episode in range(num_episodes):

    #state0  # get an initial state

    for t in count():
        # select an action
        action = e_greedy_select_action(state0)
        print("action selected by e_greedy is {}".format(action))
        # use the transition function to get the next state and whether that state is terminal
        state1, done, flag = transition_function(state0, action)
        # use the reward function to get the reward for this step
        reward, cost_migration = reward_function(state0, action, state1, flag)
Example #13
class maddpg():
    """Interacts with and learns from the environment."""
    def __init__(self, env, config):
        """Initialize an Agent object.
        
        Params
        ======
            env : environment to be handled
            config : configuration given a variety of parameters
        """

        self.env = env
        self.config = config
        # self.seed = (config['seed'])

        # set parameter for ML
        self.set_parameters(config)
        # Replay memory
        self.memory = ReplayBuffer(config)
        # Q-Network
        self.create_agents(config)
        # load agent
        if self.load_model:
            self.load_agent('trained_tennis_2k86.pth')

    def set_parameters(self, config):
        # Base agent parameters
        self.gamma = config['gamma']  # discount factor
        self.tau = config['tau']
        self.max_episodes = config[
            'max_episodes']  # max number of episodes to train
        self.env_file_name = config[
            'env_file_name']  # name and path for env app
        self.brain_name = config[
            'brain_name']  # name for env brain used in step
        self.train_mode = config['train_mode']
        self.load_model = config['load_model']
        self.save_model = config['save_model']
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        self.hidden_size = config['hidden_size']
        self.buffer_size = config['buffer_size']
        self.batch_size = config['batch_size']
        self.learn_every = config['learn_every']
        self.learn_num = config['learn_num']
        self.critic_learning_rate = config['critic_learning_rate']
        self.actor_learning_rate = config['actor_learning_rate']
        self.noise_decay = config['noise_decay']
        self.seed = (config['seed'])
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)
        self.noise_scale = 1
        self.results = struct_class()
        # Some Debug flags
        self.debug_show_memory_summary = False

    def create_agents(self, config):
        self.maddpg_agent = [ddpg_agent(config), ddpg_agent(config)]

        for a_i in range(self.num_agents):
            self.maddpg_agent[a_i].id = a_i

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # print('Step adding types') # : ,states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)
        actions = np.reshape(actions, (1, 2 * self.action_size))
        self.memory.add(states, actions, rewards, next_states, dones)

    def act(self, state):
        """Returns actions for given state as per current policy 
        shuold only get single or single joined states from train"""
        state = ten(state)
        actions = np.vstack([agent.act(state) for agent in self.maddpg_agent])
        return actions

    def actor_target(self, states):
        """Returns actions for given state as per current target_policy without noise.
           should only get batch_size states from learn"""
        actions = np.hstack([agent.act(states) for agent in self.maddpg_agent])
        return ten(actions)

    def init_results(self):
        """ Keeping different results in list in self.results, being initializd here"""
        self.results.reward_window = deque(maxlen=100)
        self.results.all_rewards = []
        self.results.avg_rewards = []
        self.results.critic_loss = []
        self.results.actor_loss = []

    def episode_reset(self, i_episode):
        self.noise_reset()
        self.episode = i_episode
        self.noise_scale *= self.noise_decay
        for agent in self.maddpg_agent:
            agent.noise_scale = self.noise_scale
            agent.episode = self.episode

    def noise_reset(self):
        for agent in self.maddpg_agent:
            agent.noise.reset()

    def train(self):
        print('Running on device : ', device)
        # if False:
        #     filename = 'trained_reacher_a_e100.pth'
        #     self.load_agent(filename)
        self.init_results()
        # training loop
        # show progressbar
        widget = [
            'episode: ',
            pb.Counter(), '/',
            str(self.max_episodes), ' ',
            pb.Percentage(), ' ',
            pb.ETA(), ' ',
            pb.Bar(marker=pb.RotatingMarker()), ' '
        ]

        timer = pb.ProgressBar(widgets=widget,
                               maxval=self.max_episodes).start()

        for i_episode in range(self.max_episodes):
            timer.update(i_episode)
            tic = time.time()

            # per episode resets
            self.episode_reset(i_episode)
            total_reward = np.zeros(self.num_agents)
            # Reset the environment
            env_info = self.env.reset(
                train_mode=self.train_mode)[self.brain_name]
            states = self.get_states(env_info)
            t = 0
            dones = np.zeros(self.num_agents, dtype=bool)

            # loop over episode time steps
            while not any(dones):
                # act and collect data
                actions = self.act(states)
                env_info = self.env.step(actions)[self.brain_name]
                next_states = self.get_states(env_info)
                rewards = env_info.rewards
                dones = env_info.local_done
                # increment stuff
                t += 1
                total_reward += rewards
                # np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
                # print('Episode {} step {} taken action {} reward {} and done is {}'.format(i_episode,t,actions,rewards,dones))
                # Proceed agent step
                self.step(states, actions, rewards, next_states, dones)
                # prepare for next round
                states = next_states
            #:while not done
            # Learn, if enough samples are available in memory
            if (i_episode % self.learn_every == 0):
                if len(self.memory) > self.batch_size:
                    for l in range(self.learn_num):
                        experiences = self.memory.sample()
                        self.learn(experiences)

            toc = time.time()
            # keep track of rewards:
            self.results.all_rewards.append(total_reward)
            self.results.avg_rewards.append(np.mean(
                self.results.reward_window))
            self.results.reward_window.append(np.max(total_reward))
            # Output Episode info :
            self.print_episode_info(total_reward, t, tic, toc)
        # for i_episode

        if self.save_model:
            filename = 'trained_tennis' + str(self.seed) + '.pth'
            self.save_agent(filename)

        return self.results

    def get_states(self, env_info):
        return np.reshape(env_info.vector_observations,
                          (1, 2 * self.state_size))

    def print_episode_info(self, total_reward, num_steps, tic, toc):
        if (self.episode % 20 == 0) or (np.max(total_reward) > 0.01):
            if np.max(total_reward) > 0.01:
                if np.sum(total_reward) > 0.15:
                    if np.sum(total_reward) > 0.25:
                        StyleString = Back.GREEN
                        print('Double Hit')
                    else:
                        StyleString = Back.BLUE
                else:
                    StyleString = Back.RED
            else:
                StyleString = ''
            print(
                StyleString +
                'Episode {} with {} steps || Reward : {} || avg reward : {:6.3f} || Noise {:6.3f} || {:5.3f} seconds, mem : {}'
                .format(self.episode, num_steps, total_reward,
                        np.mean(self.results.reward_window), self.noise_scale,
                        toc - tic, len(self.memory)))
            print(Style.RESET_ALL, end='')

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        q_target = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value """

        states, actions, rewards, next_states, dones = experiences
        # print('Learning shape : ',states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)
        # print('Learning state & reward shape : ',states[0].shape,rewards[0].shape)

        actor_loss = []
        critic_loss = []
        both_next_actions = self.actor_target(next_states)

        # print('Learn both',both_next_actions.shape)
        for agent in self.maddpg_agent:
            # In case of joined_states, we want actions_next from both agents for learning
            al, cl = agent.learn(states, actions, rewards, next_states,
                                 both_next_actions, dones)
            actor_loss.append(al)
            critic_loss.append(cl)

        self.results.actor_loss.append(actor_loss)
        self.results.critic_loss.append(critic_loss)

    def save_agent(self, filename):
        states, actions, rewards, next_states, dones = self.memory.save_buffer(
        )
        print('save agent : ', states.shape, actions.shape, rewards.shape,
              next_states.shape, dones.shape)
        torch.save(
            {
                'critic_local0':
                self.maddpg_agent[0].critic_local.state_dict(),
                'critic_target0':
                self.maddpg_agent[0].critic_target.state_dict(),
                'actor_local0': self.maddpg_agent[0].actor_local.state_dict(),
                'actor_target0':
                self.maddpg_agent[0].actor_target.state_dict(),
                'critic_local1':
                self.maddpg_agent[1].critic_local.state_dict(),
                'critic_target1':
                self.maddpg_agent[1].critic_target.state_dict(),
                'actor_local1': self.maddpg_agent[1].actor_local.state_dict(),
                'actor_target1':
                self.maddpg_agent[1].actor_target.state_dict(),
                'memory': (states, actions, rewards, next_states, dones),
            }, filename)
        print('Saved Networks and ER-memory in ', filename)
        return

    def load_agent(self, filename):
        savedata = torch.load(filename)
        self.maddpg_agent[0].critic_local.load_state_dict(
            savedata['critic_local0'])
        self.maddpg_agent[0].critic_target.load_state_dict(
            savedata['critic_target0'])
        self.maddpg_agent[0].actor_local.load_state_dict(
            savedata['actor_local0'])
        self.maddpg_agent[0].actor_target.load_state_dict(
            savedata['actor_target0'])
        self.maddpg_agent[1].critic_local.load_state_dict(
            savedata['critic_local1'])
        self.maddpg_agent[1].critic_target.load_state_dict(
            savedata['critic_target1'])
        self.maddpg_agent[1].actor_local.load_state_dict(
            savedata['actor_local1'])
        self.maddpg_agent[1].actor_target.load_state_dict(
            savedata['actor_target1'])
        states, actions, rewards, next_states, dones = savedata['memory']
        self.memory.load_buffer(states, actions, rewards, next_states, dones)
        print('Memory loaded with length : ', len(self.memory))
        return