def __init__(self, state_size, action_size, seed=0, mode='DQN', use_prioritized_memory=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.use_prioritized_memory = use_prioritized_memory

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(DEVICE)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(DEVICE)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    self.mode = mode

    print('Q Network')
    print(self.qnetwork_local)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.train_steps = 0
class TrainedBrain():
    def __init__(self, params):
        self.num_actions = params['num_actions']
        self.device = params['device']
        self.path_model = params['path_model']

        self.policy_net = QNetwork(self.num_actions).to(self.device)
        self.policy_net.load_state_dict(
            torch.load(self.path_model, map_location=self.device))
        self.policy_net.eval()

    def decide_action(self, state):
        with torch.no_grad():
            self.q_vals = self.policy_net(
                torch.from_numpy(state.copy()).float().to(self.device).unsqueeze(0))
        return int(self.q_vals.max(1)[1].view(1, 1))
class BQN(nn.Module):
    def __init__(self, state_space: int, action_num: int, action_scale: int,
                 learning_rate, device: str):
        super(BQN, self).__init__()
        self.q = QNetwork(state_space, action_num, action_scale).to(device)
        self.target_q = QNetwork(state_space, action_num, action_scale).to(device)
        self.target_q.load_state_dict(self.q.state_dict())
        self.optimizer = optim.Adam([
            {'params': self.q.linear_1.parameters(), 'lr': learning_rate / (action_num + 2)},
            {'params': self.q.linear_2.parameters(), 'lr': learning_rate / (action_num + 2)},
            {'params': self.q.value.parameters(),    'lr': learning_rate / (action_num + 2)},
            {'params': self.q.actions.parameters(),  'lr': learning_rate},
        ])
        self.update_freq = 1000
        self.update_count = 0

    def action(self, x):
        return self.q(x)

    def train_mode(self, n_epi, memory, batch_size, gamma, use_tensorboard, writer):
        state, actions, reward, next_state, done_mask = memory.sample(batch_size)
        actions = torch.stack(actions).transpose(0, 1).unsqueeze(-1)
        done_mask = torch.abs(done_mask - 1)

        cur_actions = self.q(state)
        cur_actions = torch.stack(cur_actions).transpose(0, 1)
        cur_actions = cur_actions.gather(2, actions.long()).squeeze(-1)

        target_cur_actions = self.target_q(next_state)
        target_cur_actions = torch.stack(target_cur_actions).transpose(0, 1)
        target_cur_actions = target_cur_actions.max(-1, keepdim=True)[0]
        target_action = (done_mask * gamma * target_cur_actions.mean(1) + reward)

        loss = F.mse_loss(cur_actions, target_action.repeat(1, 4))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_count += 1
        if (self.update_count % self.update_freq == 0) and (self.update_count > 0):
            self.update_count = 0
            self.target_q.load_state_dict(self.q.state_dict())

        if use_tensorboard:
            writer.add_scalar("Loss/loss", loss, n_epi)
        return loss
def train():
    env.reset()
    _, reward, done, _ = env.step(env.action_space.sample())
    state = get_state()
    memory = Memory(max_size=memory_size)

    # Pre-fill the replay memory with random-policy transitions
    for _ in range(pretrain_length):
        action = env.action_space.sample()
        _, reward, done, _ = env.step(action)
        next_state = get_state()
        if done:
            next_state = np.zeros(state.shape)
            memory.add((state, action, reward, next_state))
            env.reset()
            _, reward, done, _ = env.step(env.action_space.sample())
            state = get_state()
        else:
            memory.add((state, action, reward, next_state))
            state = next_state

    img_shape = state.shape
    network = QNetwork(height=img_shape[0], width=img_shape[1], channel=img_shape[2],
                       learning_rate=learning_rate)
    saver = tf.train.Saver()
    save_file = 'checkpoints/cartpole.ckpt'
    rewards_list = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step = 0
        loss = 0.0  # defined up front so the first logging line cannot hit an unbound name
        for ep in range(1, train_episodes + 1):
            total_reward = 0
            t = 0
            while t < max_steps:
                step += 1
                env.render()

                # Epsilon-greedy exploration with exponential decay
                explore_p = explore_stop + \
                    (explore_start - explore_stop) * np.exp(-decay_rate * step)
                if explore_p > np.random.rand():
                    action = env.action_space.sample()
                else:
                    feed = {network.inputs_: state.reshape((1, *state.shape))}
                    Qs = sess.run(network.output, feed_dict=feed)
                    action = np.argmax(Qs)

                _, reward, done, _ = env.step(action)
                next_state = get_state()
                total_reward += reward

                if done:
                    next_state = np.zeros(state.shape)
                    t = max_steps
                    print('Episode: {}'.format(ep),
                          'Total reward: {}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore Prob: {:.4f}'.format(explore_p))
                    rewards_list.append((ep, total_reward))
                    memory.add((state, action, reward, next_state))
                    env.reset()
                    _, reward, done, _ = env.step(env.action_space.sample())
                    state = get_state()
                else:
                    memory.add((state, action, reward, next_state))
                    state = next_state
                    t += 1

                batch = memory.sample(batch_size)
                states = np.array([each[0] for each in batch])
                actions = np.array([each[1] for each in batch])
                rewards = np.array([each[2] for each in batch])
                next_states = np.array([each[3] for each in batch])

                target_Qs = sess.run(network.output,
                                     feed_dict={network.inputs_: next_states})

                # Terminal transitions were stored with an all-zero next state;
                # zero out their target Q-values (broadcast works for any action count)
                temp_shape = next_states.shape
                is_episode_over = (next_states.reshape((temp_shape[0], -1)) ==
                                   np.zeros((temp_shape[1] * temp_shape[2] * temp_shape[3]))).all(axis=1)
                target_Qs[is_episode_over] = 0

                targets = rewards + gamma * np.max(target_Qs, axis=1)
                loss, _ = sess.run([network.loss, network.opt],
                                   feed_dict={network.inputs_: states,
                                              network.targetQs_: targets,
                                              network.actions_: actions})

        saver.save(sess, save_file)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).long().to(device)
        rewards = torch.from_numpy(rewards).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        dones = torch.from_numpy(dones).float().to(device)

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
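# A minimal driver sketch for the Agent above, assuming the module-level pieces it relies on
# (QNetwork, ReplayBuffer, BUFFER_SIZE/BATCH_SIZE/GAMMA/TAU/LR/UPDATE_EVERY, device) are
# already defined, and using the classic gym step/reset API the rest of this file uses.
# The environment name and epsilon schedule are illustrative choices, not taken from the source.
import gym
import numpy as np
from collections import deque

env = gym.make("CartPole-v1")
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995   # epsilon-greedy schedule (illustrative values)
scores = deque(maxlen=100)

for episode in range(1, 501):
    state = env.reset()
    score = 0
    while True:
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store transition and (maybe) learn
        state = next_state
        score += reward
        if done:
            break
    scores.append(score)
    eps = max(eps_end, eps_decay * eps)                       # decay exploration over episodes
    if episode % 50 == 0:
        print("episode {}, average score {:.1f}".format(episode, np.mean(scores)))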
class Agent():
    '''Interacts and learns from the environment'''

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialise the timestep (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in the replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Get random subset from the memory, but ONLY if there are enough samples
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy

        Params
        ------
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection (to keep an element of exploration)
        """
        # Convert the state from the Unity environment into a torch tensor.
        # To pass it through the deep network we take the numpy array and:
        #   1 - convert it to a torch tensor with from_numpy()
        #   2 - convert it to float32, which is what the network expects, with .float()
        #   3 - add a dimension on axis 0 with .unsqueeze(0), because PyTorch expects a BATCH of
        #       1-dimensional arrays (e.g. a batch of 64 arrays, each of length 37). In reinforcement
        #       learning we feed one state at a time, but the network still expects a 2D input.
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters of the deep Q-network using the given batch of experience tuples

        Params
        ------
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get the max predicted Q values for the next states from the target model.
        # Note: detach() just detaches the tensor from the grad_fn, i.e. we are going to do some
        # non-tracked computations based on the value of this tensor (we DON'T update the target
        # model at this stage).
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DQNAgent():
    def __init__(self, game_name="pong"):
        self.LEARNING_RATE = 1e-4
        self.eps_init = 1
        self.eps_final = 0.02
        self.schedule_timesteps = 5e4  # tot_timestep * explo_frac
        self.eps = self.eps_init
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.TARGET_UPDATE_C = 1000
        self.MEMORY_CAPACITY_N = 200
        self.episode_M = 5000
        self.GAME_ENV = {
            "pong": "PongNoFrameskip-v4",
            "cartpole": "CartPole-v0",
        }
        self.game_name = game_name
        env_name = self.GAME_ENV[self.game_name]
        self.env = wrapEnv(gym.make(env_name))
        self.reward_list = []
        self.Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
        self.Qnetwork.summary()

    def selectAction(self, eval_Qnetwork, state, is_train=True):
        """Select an action with an epsilon-greedy policy,
        using the neural network as the approximate value function.
        """
        if random.random() <= self.eps and is_train:
            action = self.env.action_space.sample()
        else:
            action = np.argmax(
                eval_Qnetwork.predict([
                    np.ones((1, self.env.action_space.n)),
                    np.expand_dims(np.array(state), 0)
                ]))  # (1, 84, 84, 4): add a batch dimension
        return action

    def updateQNetwork(self, eval_Qnetwork, target_Qnetwork, sample_batch, double=True):
        """DDQN/DQN network update via gradient descent."""
        # states = np.array([a[0] for a in sample_batch])
        # actions = np.array([a[1] for a in sample_batch])
        # rewards = np.array([a[2] for a in sample_batch])
        # next_states = np.array([a[3] for a in sample_batch])
        # dones = np.array([a[4] for a in sample_batch])
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for i in range(len(sample_batch)):
            states.append(np.array(sample_batch[i][0], copy=False))
            actions.append(sample_batch[i][1])
            rewards.append(sample_batch[i][2])
            next_states.append(np.array(sample_batch[i][3], copy=False))
            dones.append(sample_batch[i][4])
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        next_states = np.array(next_states)
        dones = np.array(dones)

        ones_mat = np.ones((len(sample_batch), self.env.action_space.n))
        if double == True:
            eval_actions = np.argmax(eval_Qnetwork.predict([ones_mat, next_states]), axis=1)
            target_action_Qvalue = target_Qnetwork.predict(
                [ones_mat, next_states])[range(len(sample_batch)), eval_actions]
        else:
            target_action_Qvalue = np.max(target_Qnetwork.predict([ones_mat, next_states]), axis=1)

        # y_true = eval_Qnetwork.predict(states)
        # y_true[range(len(y_true)), actions] = rewards + (1-dones)*self.GAMMA * target_action_Qvalue
        select_actions = np.zeros((len(sample_batch), self.env.action_space.n))
        select_actions[range(len(sample_batch)), actions] = 1
        y_true = rewards + (1 - dones) * self.GAMMA * target_action_Qvalue
        eval_Qnetwork.fit(x=[select_actions, states],
                          y=select_actions * np.expand_dims(y_true, axis=1),
                          epochs=1,
                          batch_size=len(sample_batch),
                          verbose=0)

    def dqnTrain(self, double=True):
        step = 0
        memory = ReplayMemory(self.MEMORY_CAPACITY_N)
        eval_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
        target_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
        eval_Qnetwork.set_weights(self.Qnetwork.get_weights())
        target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
        reward_list = self.reward_list
        time_start = time.time()
        for episode in range(1, self.episode_M + 1):
            episode_reward = 0
            state = self.env.reset()
            while True:
                step += 1
                action = self.selectAction(eval_Qnetwork, state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                memory.add((state, action, reward, next_state, done))
                state = next_state

                if len(memory) > self.BATCH_SIZE:
                    sample_batch = memory.sample(self.BATCH_SIZE)
                    self.updateQNetwork(eval_Qnetwork, target_Qnetwork, sample_batch, double)

                # self.EPS = self.EPS*self.EPS_DECAY if self.EPS > self.EPS_MIN else self.EPS_MIN
                eps_fraction = min(float(step) / self.schedule_timesteps, self.eps_init)
                self.eps = self.eps_init + eps_fraction * (self.eps_final - self.eps_init)

                if step % self.TARGET_UPDATE_C == 0:
                    target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
                if done:
                    break

            reward_list.append(episode_reward)
            print("episode: {}, reward: {}, tot_step: {}, {}min. eps: {}".format(
                episode, episode_reward, step, (time.time() - time_start) / 60, self.eps))
            if episode % 5 == 0:
                print("episode {}. recent 5 episode_reward:{}. using {} min. total step: {}. ".format(
                    episode, self.reward_list[-5:], (time.time() - time_start) / 60, step))
            if episode % 50 == 0:
                self.save(target_Qnetwork, reward_list)

        self.Qnetwork.set_weights(target_Qnetwork.get_weights())
        self.reward_list = reward_list
        return target_Qnetwork, reward_list

    def load(self, filename_prefix=None):
        if filename_prefix == None:
            filename_prefix = "pong/data/ddqn_bs" + str(self.BATCH_SIZE)
        self.Qnetwork = keras.models.load_model(filename_prefix + "network.h5")
        with open(filename_prefix + "reward.json", 'r') as file_obj:
            self.reward_list = json.loads(file_obj.read())

    def save(self, Qnetwork=None, reward_list=None, filename_prefix=None):
        if Qnetwork == None:
            Qnetwork = self.Qnetwork
        if reward_list == None:
            reward_list = self.reward_list
        if filename_prefix == None:
            filename_prefix = "pong/data/ddqn_bs_" + str(self.BATCH_SIZE)
        Qnetwork.save(filename_prefix + "network.h5")
        with open(filename_prefix + "reward.json", 'w') as file_obj:
            file_obj.write(json.dumps(reward_list))

    def playByQv(self, Qnetwork=None, episode_num=1):
        if Qnetwork == None:
            Qnetwork = self.Qnetwork
        for episode in range(1, episode_num + 1):
            state = self.env.reset()
            while True:
                self.env.render()
                action = self.selectAction(Qnetwork, state, is_train=False)
                state, reward, done, _ = self.env.step(action)
                time.sleep(0.02)
                if done:
                    break

    def plotReward(self, reward_list=None):
        if reward_list == None:
            reward_list = self.reward_list
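# The DQNAgent above only needs ReplayMemory to expose add(transition), sample(batch_size)
# and len(); a simple deque-backed buffer such as this sketch would satisfy that interface,
# though the original implementation may differ.
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity):
        # Oldest transitions are evicted automatically once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        # transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        # Uniform random minibatch without replacement
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)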
class Brain:
    def __init__(self, params):
        self.num_actions = params['num_actions']
        self.device = params['device']
        self.batch_size = params['batch_size']
        self.learning_rate = params['learning_rate']
        self.gamma = params['gamma']
        self.eps_start = params['eps_start']
        self.eps_end = params['eps_end']
        self.eps_decay = params['eps_decay']

        self.policy_net = QNetwork(self.num_actions).to(self.device)
        self.target_net = QNetwork(self.num_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.memory = ReplayMemory(params['replay_memory_size'])
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.steps_done = 0
        self.q_vals = [0] * self.num_actions
        self.loss = 0

    def decide_action(self, state):
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp(
            -1. * self.steps_done / self.eps_decay)
        self.steps_done += 1

        with torch.no_grad():
            self.q_vals = self.policy_net(
                torch.from_numpy(state).float().to(self.device).unsqueeze(0))

        sample = random.random()
        if sample > eps_threshold:
            with torch.no_grad():
                return self.q_vals.max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.num_actions)]],
                                device=self.device, dtype=torch.long)

    def optimize(self):
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([
            torch.tensor(s, device=self.device, dtype=torch.float)
            for s in batch.next_state if s is not None
        ])
        state_batch = torch.cat(
            [torch.tensor(batch.state, device=self.device, dtype=torch.float)])
        action_batch = torch.cat(
            [torch.tensor(batch.action, device=self.device, dtype=torch.long)])
        reward_batch = torch.cat(
            [torch.tensor(batch.reward, device=self.device, dtype=torch.int)])

        state_action_values = self.policy_net(state_batch).gather(1, action_batch.unsqueeze(1))

        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.unsqueeze(1)).max(1)[0].detach()

        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        self.loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())
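# Sketch of the Transition/ReplayMemory pair the Brain above expects: optimize() calls
# self.memory.sample(...) and unpacks the result with Transition(*zip(*transitions)), reading
# the fields state, action, reward and next_state. The push() helper is an assumption about
# how the surrounding training loop would fill the buffer; the original code may differ.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state):
        # Store one transition; next_state is None for terminal states
        self.memory.append(Transition(state, action, reward, next_state))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)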
def main(env,
         N_EPISODE=100,
         MAX_STEPS=300,
         SUCESS_STEP=200,
         STOP_MAX_EPISODE=5,
         EPSILONE_START=1.0,
         EPSILONE_END=0.01,
         EPSILONE_DECAY=0.001,
         GAMMA=0.99,
         WARMUP=10,
         MEMORYSIZE=10000,
         BATCHSIZE=32,
         PLOT=True):

    def _update_target_parameter(_main_qn, _target_qn):
        _target_qn.model.set_weights(_main_qn.model.get_weights())
        return _target_qn

    def _get_egreedy_actions(epsilon, actions):
        if random.random() < epsilon:
            action_num = actions.shape[1]
            action = random.choice([a for a in range(action_num)])
        else:
            action = np.argmax(actions[0])
        return action

    def _update_parameter(_main_qn, _target_qn, _memory, BATCHSIZE):
        memories = _memory.sample(BATCHSIZE)
        states = np.zeros((BATCHSIZE, statesize))
        targets = np.zeros((BATCHSIZE, actionsize))
        for m_ind, (_state, _action, _reward, _n_state) in enumerate(memories):
            _state_arr = _state.reshape(1, statesize)
            _n_state_arr = _n_state.reshape(1, statesize)
            states[m_ind] = _state_arr
            if not (_n_state_arr == np.zeros((1, statesize))).all(axis=1):
                target = _reward + GAMMA * \
                    np.amax(_target_qn.model.predict(_n_state_arr)[0])
            else:
                target = _reward
            targets[m_ind] = _main_qn.model.predict(_state_arr)
            targets[m_ind][_action] = target
        _main_qn.model.fit(states, targets, epochs=1, verbose=0)
        return _main_qn

    # Get Env parameter
    actionsize = env.get_action_space()
    statesize = env.get_observation_space()

    # Create Network
    main_qn = QNetwork(statesize, actionsize)
    target_qn = QNetwork(statesize, actionsize)

    # Create Memory
    memory = ExperienceReplayd(MEMORYSIZE)
    epsilon = EPSILONE_START

    # Dataholder for plotting
    if PLOT:
        history = {"episode": [], "step": []}

    # Repeat Episode
    total_step = 0
    success_episode = 0
    for i in range(N_EPISODE):
        print("Episode : " + str(i))
        # Initialize Env
        state = env.reset()
        # Update target network parameter
        target_qn = _update_target_parameter(main_qn, target_qn)

        # Take actions
        for steps in range(MAX_STEPS):
            total_step += 1
            # Decay Epsilon
            epsilon = EPSILONE_END + \
                (EPSILONE_START - EPSILONE_END) * \
                np.exp(-EPSILONE_DECAY * total_step)
            print("Step : " + str(steps) + " Epsilon : " + str(epsilon))

            state_arr = state.reshape(1, statesize)
            actions = target_qn.model.predict(state_arr)
            choiced_action = _get_egreedy_actions(epsilon, actions)
            n_state, reward, done, info = env.step(choiced_action)

            if done:
                n_state = np.zeros(n_state.shape)

            if steps >= WARMUP:
                m = (state, choiced_action, reward, n_state)
                memory.add(m)

            if memory.get_cnt() > BATCHSIZE:
                main_qn = _update_parameter(main_qn, target_qn, memory, BATCHSIZE)

            if done:
                print("Done:" + str(done))
                success_episode = 0
                break

            state = n_state

            if steps >= SUCESS_STEP - 1:
                success_episode += 1
                print("MAX EPISODE")
                break

        if PLOT:
            history["episode"].append(i)
            history["step"].append(steps)
            epi = history["episode"]
            stp = history["step"]
            plt.plot(epi, stp, 'b')
            plt.title('Max steps per episode')
            plt.legend()
            plt.savefig("plot.png")

        if success_episode >= STOP_MAX_EPISODE:
            print("SUCCESS!!")
            break
class Agent:
    def __init__(self, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, lr=LR, epsilon=EPSILON):
        # local network for estimation,
        # target network for computing the TD target
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.network_loc = QNetwork()
        self.network_targ = QNetwork()
        setWeights(self.network_loc, self.network_targ)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.buffer = ReplayBuffer(buffer_size=buffer_size, batch_size=batch_size)
        self.actions = actionlst()
        self.context_extractor = ContextExtractor()
        self.state_target = None
        # Build the models so they know the input shape
        self.network_loc.build(tf.TensorShape([None, STATE_LENGTH]))
        self.network_targ.build(tf.TensorShape([None, STATE_LENGTH]))

    def __getState(self, img):
        # Private method to get the state from a numpy image
        img_resize = tf.image.resize_images(img, [VGG_SHAPE, VGG_SHAPE]) / 255.0
        ctx = self.context_extractor(img_resize.numpy())
        color = get_histogram(img)
        return combine(color, ctx)

    def __getAction(self, state, epsilon):
        # Epsilon-greedy policy
        # Add batch dimension
        state = np.expand_dims(state, 0)
        predicts = self.network_loc(state)
        action = np.argmax(predicts)
        state = np.squeeze(state, 0)
        random = np.random.choice(12, 1)[0]
        if np.random.random_sample() > (1 - epsilon):
            return random
        else:
            return action

    def clearBuffer(self):
        self.buffer.clear()

    def setTarget(self, target):
        # This should be called at the beginning of
        # each (src, target) pair training
        self.state_target = self.__getState(target)

    def predict(self, img):
        # Given an image, return the updated image
        state_cur = self.__getState(img)
        state_cur = state_cur.astype(np.float32)
        action = self.__getAction(state_cur, 0)
        img_nxt = applyChange(self.actions, action, img)
        return img_nxt, state_cur

    def step(self, img_prev):
        # Given an input image as a numpy array,
        # return (s, a, s', r) and the image after the action
        # 1. Extract features
        state_prev = self.__getState(img_prev)
        state_prev = state_prev.astype(np.float32)
        # 2. Feed to local network and get action
        action = self.__getAction(state_prev, self.epsilon)
        # 3. Apply action and get img_cur, state_cur
        img_cur = applyChange(self.actions, action, img_prev)
        state_cur = self.__getState(img_cur)
        # Only float32 can be fed to the network
        state_cur = state_cur.astype(np.float32)
        # 4. Calculate reward
        r = reward(state_prev, state_cur, self.state_target)
        # Return the (s, a, s', r) tuple and img_cur
        return (state_prev, action, state_cur, r), img_cur

    def record(self, state_prev, action, state_cur, reward):
        # Save (s, a, s', r) to the replay buffer
        self.buffer.add(state_prev, action, state_cur, reward)

    def learn(self):
        # 1. Sample a batch from the replay buffer
        # Only sample whole batches
        state_ps, actions, state_cs, rs = self.buffer.sample()
        if state_ps.shape[0] == 0:
            return False
        # Debug code
        assert state_ps.shape == (state_ps.shape[0], STATE_LENGTH)
        assert actions.shape == (actions.shape[0], )
        assert state_cs.shape == (state_cs.shape[0], STATE_LENGTH)
        assert rs.shape == (rs.shape[0], )

        # 2. Compute loss based on q_target and q_estimate
        index = []
        for idx, a in enumerate(actions):
            index.append([idx, a])
        with tf.GradientTape() as tape:
            q_est = tf.gather_nd(self.network_loc(state_ps), index)
            q_targ = tf.reduce_max(self.network_targ(state_cs), axis=1)
            target = rs + GAMMA * q_targ
            loss = tf.losses.mean_squared_error(target, q_est)

        # 3. Back-propagate
        grads = tape.gradient(loss, self.network_loc.variables)
        self.optimizer.apply_gradients(zip(grads, self.network_loc.variables))

        # 4. Soft-update the target network
        self.__soft_update()
        return True

    def __soft_update(self):
        # Slowly update the target network:
        # iterate through all layers and set weights
        for layer_t, layer_loc in zip(self.network_targ.layers, self.network_loc.layers):
            target = layer_t.get_weights()
            loc = layer_loc.get_weights()
            for i in range(len(target)):
                target[i] = (1 - TAU) * target[i] + TAU * loc[i]
            layer_t.set_weights(target)
class Agent(object):
    def __init__(self, state_size, action_size, seed, config):
        self.state_size = state_size
        self.action_size = action_size
        self.config = config
        self.seed = random.seed(seed)

        self.local_q_net = QNetwork(state_size, action_size, seed).to(device)
        self.target_q_net = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_q_net.parameters(), lr=config["LR"])

        self.memory = ReplayBuffer(action_size, config["BUFFER_SIZE"], config["BATCH_SIZE"], seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.config["UPDATE_EVERY"]
        if self.t_step == 0:
            # If the agent has gathered enough experience
            if len(self.memory) > self.config["BATCH_SIZE"]:
                experiences = self.memory.sample()
                # Learn from previous experiences
                self.learn(experiences, self.config["GAMMA"])

    def act(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_q_net.eval()
        with torch.no_grad():
            action_values = self.local_q_net(state)
        self.local_q_net.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        # Double Q-Learning
        states, actions, rewards, next_states, dones = experiences

        # Select the next action with the local Q-network
        q_targets_next_expected = self.local_q_net(next_states).detach()
        q_targets_next_expected_actions = q_targets_next_expected.max(1)[1].unsqueeze(1)

        # Evaluate that action with the target Q-network (non over-estimated targets)
        q_targets_next = self.target_q_net(next_states).gather(1, q_targets_next_expected_actions)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Expected value from the local network
        q_expected = self.local_q_net(states).gather(1, actions)

        loss = torch.nn.functional.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.local_q_net, self.target_q_net, self.config["TAU"])

    def soft_update(self, local_net, target_net, tau):
        for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
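# A toy illustration (made-up Q-value tables standing in for real network outputs) of why the
# Double-DQN target computed in learn() above differs from the vanilla DQN target: actions are
# selected with the local network but evaluated with the target network.
import torch

q_local_next = torch.tensor([[1.0, 3.0], [2.0, 0.5]])    # local net output on next_states
q_target_next = torch.tensor([[2.0, 0.8], [1.5, 0.4]])   # target net output on next_states

best_actions = q_local_next.max(1)[1].unsqueeze(1)        # argmax under the local net -> [[1], [0]]
double_q = q_target_next.gather(1, best_actions)          # evaluate under the target net -> [[0.8], [1.5]]
vanilla_q = q_target_next.max(1)[0].unsqueeze(1)          # plain max under the target net -> [[2.0], [1.5]]

print(double_q.squeeze(1))   # tensor([0.8000, 1.5000])
print(vanilla_q.squeeze(1))  # tensor([2.0000, 1.5000]), the over-estimated value Double DQN avoids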
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        def loss_dqn(output, target):
            # Element-wise squared TD error
            return (target - output) ** 2

        states, actions, rewards, next_states, dones = experiences

        # Calculate the value of the target in the next state
        pred = self.qnetwork_target(next_states)  # (64, 4)
        target = rewards.clone()  # (64, 1); clone so the sampled rewards are not modified in place
        for i in range(BATCH_SIZE):
            # Only bootstrap for non-terminal transitions
            if dones[i] == False:
                target[i] = rewards[i] + GAMMA * torch.max(pred[i])

        # The loss
        output = self.qnetwork_local(states)
        # Use gather in order to have the correct slicing
        output_action_value = output.gather(1, actions.view(-1, 1))
        loss = loss_dqn(output_action_value, target).mean()

        # Reset gradients and take an optimisation step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
def create_network(self):
    """
    Creates and initialises the network

    Returns:
        A dict containing the network properties
        {'input': input, 'output': output, 'target': target,
         'action': action, 'training_step': training_step}
    """
    # IO Placeholders
    input = tf.placeholder(tf.float32,
                           shape=[None, self.image_height, self.image_width, self.input_frame_length])
    target = tf.placeholder(tf.float32, shape=[None])
    action = tf.placeholder(tf.float32, shape=[None, self.num_actions])

    # First Layer
    W_conv1 = QNetwork.weight_variable([3, 3, self.input_frame_length, 16])
    b_conv1 = QNetwork.bias_variable([16])
    h_conv1 = tf.nn.relu(QNetwork.conv2d(input, W_conv1) + b_conv1)
    h_pool1 = QNetwork.max_pool_2x2(h_conv1)

    # Second Layer
    W_conv2 = QNetwork.weight_variable([3, 3, 16, 32])
    b_conv2 = QNetwork.bias_variable([32])
    h_conv2 = tf.nn.relu(QNetwork.conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = QNetwork.max_pool_2x2(h_conv2)

    # Third Layer (fully connected); // keeps the shape values integral under Python 3
    W_fc1 = QNetwork.weight_variable([(self.image_height // 4) * (self.image_width // 4) * 32, 256])
    b_fc1 = QNetwork.bias_variable([256])
    h_pool2_flat = tf.reshape(h_pool2, [-1, (self.image_height // 4) * (self.image_width // 4) * 32])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    # Fourth Layer (output)
    W_fc2 = QNetwork.weight_variable([256, self.num_actions])
    b_fc2 = QNetwork.bias_variable([self.num_actions])
    output = tf.matmul(h_fc1, W_fc2) + b_fc2

    # Train and Eval Steps
    action_value = tf.reduce_sum(tf.multiply(output, action), axis=1)
    error = tf.reduce_mean(tf.square(target - action_value))
    training_step = tf.train.AdamOptimizer(1e-6).minimize(error)

    QNetwork.variable_summaries(output, 'output')
    QNetwork.variable_summaries(error, 'error')
    QNetwork.variable_summaries(W_fc2, 'final_weights')

    return {'input': input, 'output': output, 'target': target,
            'action': action, 'training_step': training_step}
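# The layer helpers referenced above (weight_variable, bias_variable, conv2d, max_pool_2x2,
# variable_summaries) are not shown in this snippet. Below is a conventional TF1-style sketch of
# what they might look like; in the source they would be static methods on QNetwork itself, and
# the class name here is only a placeholder so the sketch is self-contained.
import tensorflow as tf

class QNetworkHelpers:
    @staticmethod
    def weight_variable(shape):
        # Small random initial weights
        return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

    @staticmethod
    def bias_variable(shape):
        # Small positive bias to keep ReLUs active early on
        return tf.Variable(tf.constant(0.1, shape=shape))

    @staticmethod
    def conv2d(x, W):
        # Stride-1 'SAME' convolution
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    @staticmethod
    def max_pool_2x2(x):
        # 2x2 max pooling, halving height and width
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    @staticmethod
    def variable_summaries(var, name):
        # TensorBoard summaries for a tensor
        with tf.name_scope(name):
            tf.summary.scalar('mean', tf.reduce_mean(var))
            tf.summary.histogram('histogram', var)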