class CartPoleAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95            # discount rate
        self.epsilon = 1.0           # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.memory = Memory(2000)
        self.DQN = DQN(state_size, action_size)

    def act(self, state):
        # Epsilon-greedy action selection.
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.DQN.predict(state)
        return np.argmax(act_values[0])  # returns the greedy action

    def remember(self, state, action, reward, next_state, done):
        self.memory.add((state, action, reward, next_state, done))

    def replay(self, batch_size):
        minibatch = self.memory.sample(batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    self.DQN.predict(next_state)[0])
            target_f = self.DQN.predict(state)
            target_f[0][action] = target
            self.DQN.train(state, target_f, epochs=1)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
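# The snippets above and below use a `Memory` replay buffer but never define
# it. A minimal sketch of the uniform-replay version, assuming only the
# interface actually called here (`Memory(capacity)`, `add(experience)`,
# `sample(batch_size)`, and a `.buffer` attribute for length checks); the
# deque-based layout is an assumption, not the original implementation.
import random
from collections import deque

class Memory:
    def __init__(self, capacity):
        # Oldest experiences are evicted automatically once the buffer is full.
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform sampling without replacement.
        return random.sample(self.buffer, batch_size)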
with tf.Session() as sess:
    model = DQN(sess, state_size, action_size, learning_rate, gamma)
    memory = Memory(memory_size)
    while True:
        state = env.reset()
        for t in range(scene_iteration):
            env.render()
            # Play a new action
            action = model.select_action(state)
            new_state, reward, done, info = env.step(np.argmax(action))
            # if done:
            #     reward = -1.0
            memory.add((state, new_state, reward, action, done))
            # After batch_size experiences, train the model
            if len(memory.buffer) > batch_size:
                state_batch, new_state_batch, reward_batch, action_batch, done_batch = memory.sample(
                    batch_size)
                model.learn(state_batch, new_state_batch, reward_batch,
                            action_batch, done_batch)
            state = new_state
            # Test phase
            if len(memory.buffer) > test_batch_size:
                state_batch, new_state_batch, reward_batch, action_batch, done_batch = memory.sample(
                    test_batch_size)
                current_Q = np.max(sess.run(model.output, feed_dict={
class Agent(object):
    def __init__(self, state_count, action_count):
        self.state_count = state_count
        self.action_count = action_count
        self.brain = Brain(state_count, action_count)
        self.memory = Memory(MEMORY_CAPACITY)
        self.epsilon = MAX_EPSILON
        self.steps = 0

    def act(self, s):
        if random.random() < self.epsilon:
            return random.randint(0, self.action_count - 1)
        else:
            return np.argmax(self.brain.predict_one(s))

    def observe(self, samples):
        self.memory.add(samples)
        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.update_target()
        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(
            -LAMBDA * self.steps)

    def replay(self):
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)
        no_state = np.zeros(self.state_count)
        states = np.array([o[0] for o in batch])
        states_ = np.array([(no_state if o[3] is None else o[3]) for o in batch])

        p = self.brain.predict(states)
        # Double DQN: the online network picks the next action,
        # the target network evaluates it.
        p_ = self.brain.predict(states_, target=False)
        pTarget_ = self.brain.predict(states_, target=True)

        x = np.zeros((batchLen, self.state_count))
        y = np.zeros((batchLen, self.action_count))
        for i in range(batchLen):
            o = batch[i]
            s = o[0]
            a = o[1]
            r = o[2]
            s_ = o[3]
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * pTarget_[i][np.argmax(p_[i])]  # double DQN
            x[i] = s
            y[i] = t
        self.brain.train(x, y)
# print("step:", i) # If it's the first step if i == 0: # First we need a state state = game.get_state().screen_buffer state, stacked_frames = stack_frames(stacked_frames, state, True) # Random action action = random.choice(possible_actions) # Get the rewards reward = game.make_action(action) done = game.is_episode_finished() if done: # We finished the episode next_state = np.zeros(state.shape) # Add experience to memory memory.add((state, action, reward, next_state, done)) # Start a new episode game.new_episode() # First we need a state state = game.get_state().screen_buffer # Stack the frames state, stacked_frames = stack_frames(stacked_frames, state, True) else: # Get the next state next_state = game.get_state().screen_buffer next_state, stacked_frames = stack_frames(stacked_frames, next_state, False) # Add experience to memory memory.add((state, action, reward, next_state, done)) # Our state is now the next_state state = next_state
class DistributionalRL:
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        state_size = 1
        neurons = 24
        self.actions = actions
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 5
        # Support of the value distribution (C51): atoms z_j in [v_min, v_max].
        self.v_max = 10
        self.v_min = -10
        self.atoms = 51
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]
        self.m = Build_Model(state_size, neurons, len(actions), atoms=self.atoms)
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)  # target network
        self.capacity = 300
        self.memory = Memory(self.capacity)

    @timecost
    def choose_action(self, s):
        if np.random.uniform() < self.epsilon:
            # choose the best action: expected value of each action's distribution
            state_action = []
            for i in self.model.predict([s]):
                state_action.append(
                    np.sum([self.z[j] * i[0][j] for j in range(self.atoms)]))
            action = np.random.choice([
                i for i in range(len(state_action))
                if state_action[i] == max(state_action)
            ])
        else:
            # choose an action randomly
            action = np.random.choice(self.actions)
        return action

    @timecost
    def learn(self, s, a, r, s_, done):
        batch_size = 50
        record_size = self.capacity
        loss, q_distribution = self.get_q_value(s, a, r, s_, done)
        self.memory.add(loss, [s, a, r, s_, q_distribution])
        self.count += 1
        # train after every record_size stored transitions
        if self.count % record_size == 0:
            batch, idxs, is_weight = self.memory.sample(batch_size)
            X_train = np.zeros((batch_size, 1))
            Y_train = [
                np.zeros((batch_size, self.atoms))
                for i in range(len(self.actions))
            ]
            for i in range(batch_size):
                X_train[i] = batch[i][0]
                for i_ in range(len(self.actions)):
                    Y_train[i_][i][:] = batch[i][4][i_][:]
            print('-----training-----')
            self.model.fit(X_train, Y_train, epochs=3, verbose=0)
            # update prioritized experience
            for i in range(batch_size):
                #_s, _a, _r, _s_ = batch[i][0], batch[i][1], batch[i][2], batch[i][3]
                #loss = self.get_q_value(_s, _a, _r, _s_, done)[0]
                self.memory.update(idxs[i], 0)

    @timecost
    def get_q_value(self, s, a, r, s_, done):
        p = self.model.predict([s])
        old_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # double DQN here as well: the online net picks the next action,
        # the target net evaluates it
        p_next = self.model.predict([s_])
        p_d_next = self.dump_model.predict([s_])
        q = np.sum(np.multiply(np.vstack(p_next), np.array(self.z)), axis=1)
        next_action_idxs = np.argmax(q)
        # initialize the projected distribution m
        m_prob = [np.zeros((1, self.atoms))]
        # project the Bellman update onto the fixed support
        if done:
            # distribution collapses to a single point
            Tz = min(self.v_max, max(self.v_min, r))
            bj = (Tz - self.v_min) / self.delta_z
            m_l, m_u = math.floor(bj), math.ceil(bj)
            m_prob[0][0][int(m_l)] += (m_u - bj)
            m_prob[0][0][int(m_u)] += (bj - m_l)
        else:
            for j in range(self.atoms):
                Tz = min(self.v_max, max(self.v_min, r + self.gamma * self.z[j]))
                bj = (Tz - self.v_min) / self.delta_z
                m_l, m_u = math.floor(bj), math.ceil(bj)
                m_prob[0][0][int(m_l)] += p_d_next[next_action_idxs][0][j] * (m_u - bj)
                m_prob[0][0][int(m_u)] += p_d_next[next_action_idxs][0][j] * (bj - m_l)
        # write the updated distribution back into p for training
        p[a][0][:] = m_prob[0][0][:]
        # compute the new Q estimate
        new_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # TD error used as the PER priority
        error = abs(old_q[a] - new_q[a])
        return error, p
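# Several agents here (DistributionalRL, QLearningTable, NoisyQ, the PER
# Agents below) assume a prioritized replay `Memory` with the interface
# `add(priority, data)`, `sample(n) -> (batch, idxs, is_weights)` and
# `update(idx, priority)`. Real implementations usually back this with a sum
# tree; the flat-array version below is a simplified sketch of the same
# interface, and the alpha/beta/eps constants are assumptions.
import numpy as np

class Memory:
    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=0.01):
        self.capacity = capacity
        self.alpha = alpha     # how strongly priorities skew sampling
        self.beta = beta       # importance-sampling correction strength
        self.eps = eps         # keeps every priority strictly positive
        self.data = []
        self.priorities = []
        self.pos = 0           # next slot to overwrite once full

    def add(self, error, sample):
        p = (abs(error) + self.eps) ** self.alpha
        if len(self.data) < self.capacity:
            self.data.append(sample)
            self.priorities.append(p)
        else:
            self.data[self.pos] = sample
            self.priorities[self.pos] = p
            self.pos = (self.pos + 1) % self.capacity

    def sample(self, n):
        # Sample proportionally to priority.
        probs = np.array(self.priorities) / sum(self.priorities)
        idxs = np.random.choice(len(self.data), n, p=probs)
        batch = [self.data[i] for i in idxs]
        # Importance-sampling weights correct the sampling bias.
        weights = (len(self.data) * probs[idxs]) ** (-self.beta)
        weights /= weights.max()
        return batch, idxs, weights

    def update(self, idx, error):
        self.priorities[idx] = (abs(error) + self.eps) ** self.alpha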
for _ in range(N_EPISODES_PRETRAIN):
    env.reset()
    while True:
        states = dict()
        actions = dict()
        rewards = dict()
        next_states = dict()
        step_action = []
        for agent in agent_names:
            states[agent] = env.get_observation(agent)
            action = random.randint(0, 1)
            actions[agent] = action
        rewards, next_states, is_finished = env.set_action(actions)
        memory.add([states, actions, rewards, next_states, is_finished])
        if is_finished:
            break

# function to update the Q-learning model
def update_model():
    minibatch = memory.sample(BATCH_SIZE)
    batch_states = []
    batch_targets = []
    for states, actions, rewards, next_states, done in minibatch:
        np_states = []
        np_actions = []
        np_rewards = []
        np_next_states = []
class QLearningTable:
    def __init__(self, actions, learning_rate=0.001, reward_decay=0.9,
                 e_greedy=0.4):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.batch_size = 25
        self.state_size = 4
        # neural network
        M = Build_Model(4, 4, 4)
        self.model = M.build()
        self.target_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        self.epochs = 1
        # memory
        self.capacity = 200
        self.memory = Memory(self.capacity)
        self.store_times = 0

    def choose_action(self, s):
        # action selection
        if np.random.uniform() < self.epsilon:
            # choose the best action (np.argmax breaks ties by index)
            s = np.array(s)
            state_action = self.model.predict([[s]])[0]
            action = np.argmax(state_action)
        else:
            # choose a random action
            action = np.random.choice(self.actions)
        return action

    def store_memory(self, s, a, r, s_):
        # give terminal-reward transitions a much higher initial priority
        if r in [1, -1]:
            self.memory.add(100, [s, a, r, s_])
            self.memory.add(100, [s, a, r, s_])
        else:
            self.memory.add(1, [s, a, r, s_])
        self.store_times += 1

    def learn(self):
        self.loss_record = []
        batch, index, is_weight = self.memory.sample(self.batch_size)
        # initialize the training data
        X_train = np.zeros((self.batch_size, self.state_size))
        Y_train = [np.zeros(len(self.actions)) for i in range(self.batch_size)]
        for i in range(self.batch_size):
            s, a, r, s_ = batch[i][0], batch[i][1], batch[i][2], batch[i][3]
            q_table = self.model.predict([[s]])[0]
            q_predict = q_table[a]
            if s_ != 'terminal':
                # double DQN: the online net selects the action at s_,
                # the target net evaluates it
                q_next_table = self.target_model.predict([[s_]])[0]
                next_action = np.argmax(self.model.predict([[s_]])[0])
                q_target = r + self.gamma * q_next_table[next_action]  # next state is not terminal
            else:
                q_target = r  # next state is terminal
            loss = abs(q_target - q_predict)  # TD error, used as the PER priority
            q_table[a] += (q_target - q_predict)
            self.loss_record.append(loss)
            # set up the training data
            X_train[i] = s
            for i_ in range(len(self.actions)):
                Y_train[i][i_] = q_table[i_]
        # training
        for epoch in range(self.epochs):
            self.train(self.model, X_train, Y_train)
        # memory update
        for i in range(self.batch_size):
            self.memory.update(index[i], self.loss_record[i])

    def _loss(self, model, x, y):
        x = np.array(x)
        y_ = model([[x]])
        loss = huber_loss(y, y_)
        return loss

    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(model, inputs, targets)
        return tape.gradient(loss_value, self.model.trainable_variables)

    def train(self, model, s, q):
        grads = self._grad(model, s, q)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables),
            get_or_create_global_step())
class SmartAgent:
    def __init__(self, stack_size, max_memory_size, model_save_path):
        self.env = gym.make('BreakoutDeterministic-v4')
        self.memory = Memory(max_memory_size)
        self.frame_stack_size = stack_size
        self.explore_prob = .08
        self.explore_prob_final = 0.01
        self.explore_decay = .995
        self.DQN = DQNModel(self.env.action_space.n, stack_size)
        self.num_exps = 0
        self.model_save_path = model_save_path
        self.discount = .99

    def setup_DQN(self, load_existing=None, model_path=None):
        if load_existing:
            self.DQN.restore_model(model_path)
        else:
            self.DQN.build_target_model()
            self.DQN.build_prediction_model()
            self.DQN.save_model_params(self.model_save_path)

    def train(self, batch_size, num_epochs, update_target):
        mem_block = self.memory.sample(batch_size)
        self.DQN.train(mem_block, self.discount, num_epochs)
        if self.num_exps % 1000 == 0:
            self.DQN.save_model_params(self.model_save_path)
            self.DQN.update_target_model()
        if self.explore_prob > self.explore_prob_final:
            self.explore_prob *= 0.9999

    def test_policy(self):
        is_new_episode = True
        frame_stack = Util.new_frame_stack(self.frame_stack_size)
        first_frame = self.env.reset()
        state, frame_stack = Util.stack_frames(frame_stack,
                                               self.frame_stack_size,
                                               first_frame, is_new_episode)
        episode_is_done = False
        while not episode_is_done:
            is_new_episode = False
            # Take a greedy action and step the environment
            action = self.DQN.get_next_action(state, self.explore_prob)
            frame, reward, episode_is_done, _ = self.env.step(action)
            self.env.render()
            next_state, frame_stack = Util.stack_frames(
                frame_stack, self.frame_stack_size, frame, is_new_episode)
            state = next_state

    def gather_experience(self, num_training):
        for i in range(num_training):
            pts = 0
            episode_is_done = False
            is_new_episode = True
            # Start the environment and put the first frame into a stack
            frame_stack = Util.new_frame_stack(self.frame_stack_size)
            first_frame = self.env.reset()
            state, frame_stack = Util.stack_frames(frame_stack,
                                                   self.frame_stack_size,
                                                   first_frame, is_new_episode)
            # Repeat each chosen action for a few consecutive frames
            num_consec_actions = 4
            num_acts = 0
            while not episode_is_done:
                is_new_episode = False
                # Take an action
                if num_acts % num_consec_actions == 0:
                    num_acts = 0
                    action = self.DQN.get_next_action(state, self.explore_prob)
                frame, reward, episode_is_done, _ = self.env.step(action)
                pts += reward
                num_acts += 1
                self.env.render()
                next_state, frame_stack = Util.stack_frames(
                    frame_stack, self.frame_stack_size, frame, is_new_episode)
                # Clip the reward to its sign, as in the DQN paper
                experience = (state, action, np.sign(reward), next_state,
                              episode_is_done)
                state = next_state
                self.memory.add(experience)
                self.num_exps += 1
                # Train every 4th experience once the buffer has warmed up
                if self.num_exps >= 100:
                    if (self.num_exps - 200) % 4 == 0:
                        self.train(32, 1, True)
            print('Total points:', pts)
class Agent:
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        self.state_size = 4
        neurons = 24
        self.actions = actions
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 50
        # Support of the value distribution (C51): atoms z_j in [v_min, v_max].
        self.v_max = 10
        self.v_min = -10
        self.atoms = 51
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]
        self.m = Build_Model(self.state_size, neurons, len(actions),
                             atoms=self.atoms)
        self.m.build()
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)  # target network
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        self.batch_size = 100
        self.capacity = 300
        self.memory = Memory(self.capacity)
        self.record_size = self.capacity

    @timecost
    def choose_action(self, s):
        if np.random.uniform() < self.epsilon:
            # choose the best action: expected value of each action's distribution
            state_action = []
            for i in self.model.predict([[s]]):
                state_action.append(
                    np.sum([self.z[j] * i[0][j] for j in range(self.atoms)]))
            action = np.random.choice([
                i for i in range(len(state_action))
                if state_action[i] == max(state_action)
            ])
        else:
            # choose an action randomly
            action = np.random.choice(self.actions)
        return action

    #@timecost
    def learn(self, s, a, r, s_, done):
        loss, q_distribution = self.get_q_value(s, a, r, s_, done)
        self.memory.add(loss, [s, a, r, s_, q_distribution])
        self.count += 1
        # train after every record_size stored transitions
        if self.count % self.record_size == 0:
            batch, idxs, is_weights = self.memory.sample(self.batch_size)
            X_train = np.zeros((self.batch_size, self.state_size))
            Y_train = [
                np.zeros((self.batch_size, self.atoms))
                for i in range(len(self.actions))
            ]
            for i in range(self.batch_size):
                X_train[i] = batch[i][0]
                for i_ in range(len(self.actions)):
                    Y_train[i_][i][:] = batch[i][4][i_][:]
            print('-----training-----')
            for i in range(self.epochs):
                self.train(X_train, Y_train)
            # update prioritized experience
            for i in range(self.batch_size):
                _s, _a, _r, _s_, is_weight = (batch[i][0], batch[i][1],
                                              batch[i][2], batch[i][3],
                                              is_weights[i])
                loss = self.get_q_value(_s, _a, _r, _s_, done)[0]
                self.memory.update(idxs[i], is_weight * loss)

    #@timecost
    def get_q_value(self, s, a, r, s_, done):
        p = self.model.predict([[s]])
        old_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # double DQN here as well: the online net picks the next action,
        # the target net evaluates it
        p_next = self.model.predict([[s_]])
        q = np.sum(np.multiply(np.vstack(p_next), np.array(self.z)), axis=1)
        p_d_next = self.dump_model.predict([[s_]])
        next_action_idxs = np.argmax(q)
        # initialize the projected distribution m
        m_prob = [np.zeros((1, self.atoms))]
        # project the Bellman update onto the fixed support
        if done:
            # distribution collapses to a single point
            Tz = min(self.v_max, max(self.v_min, r))
            bj = (Tz - self.v_min) / self.delta_z
            m_l, m_u = math.floor(bj), math.ceil(bj)
            m_prob[0][0][int(m_l)] += (m_u - bj)
            m_prob[0][0][int(m_u)] += (bj - m_l)
        else:
            for j in range(self.atoms):
                Tz = min(self.v_max,
                         max(self.v_min, r + self.gamma * self.z[j]))
                bj = (Tz - self.v_min) / self.delta_z
                m_l, m_u = math.floor(bj), math.ceil(bj)
                m_prob[0][0][int(m_l)] += p_d_next[next_action_idxs][0][j] * (m_u - bj)
                m_prob[0][0][int(m_u)] += p_d_next[next_action_idxs][0][j] * (bj - m_l)
        # write the updated distribution back into p for training
        p[a][0][:] = m_prob[0][0][:]
        # compute the new Q estimate
        new_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # TD error used as the PER priority
        error = abs(old_q[a] - new_q[a])
        return error, p

    def _loss(self, model, x, y):
        y_ = self.model(x)
        loss = tf.nn.softmax_cross_entropy_with_logits(y, y_)
        return loss

    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(self.model, inputs, targets)
            return loss_value, tape.gradient(loss_value,
                                             self.model.trainable_variables)

    def train(self, s, q):
        loss_value, grads = self._grad(self.model, s, q)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables),
            get_or_create_global_step())
def train(sess, env, args, actor, critic, actor_noise, desired_goal_dim,
          achieved_goal_dim, observation_dim):
    # Set paths to save results
    tensorboard_dir = ('./' + args['env'] + '_' + args['name'] + '/train_' +
                       datetime.now().strftime('%Y-%m-%d-%H'))
    model_dir = './' + args['env'] + '_' + args['name'] + '/model'

    # add summaries for TensorBoard
    summary_ops, summary_vars = build_summaries()

    # initialize variables, create writer and saver
    sess.run(tf.compat.v1.global_variables_initializer())
    saver = tf.compat.v1.train.Saver()
    writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph)

    # restore the session if a checkpoint exists
    try:
        saver.restore(sess, os.path.join(model_dir, args['env'] + '_' + args['name'] + '.ckpt'))
        print('------------------------Continue--------------------------')
    except Exception:
        print('----------------------New Training------------------------')

    # initialize target network weights and replay memory
    actor.update()
    critic.update()
    replay_memory = Memory(int(args['memory_size']), int(args['seed']))

    # training loop
    for i in range(int(args['episodes'])):
        # reset gym, get achieved_goal, desired_goal, state
        achieved_goal, desired_goal, s, s_prime = unpack(env.reset())
        episode_reward = 0
        episode_maximum_q = 0

        for j in range(int(args['episode_length'])):
            if args['render']:
                env.render()

            # predict an action and add exploration noise
            a = actor.predict(np.reshape(s, (1, actor.state_dim)))
            a = a + actor_noise.get_noise()

            # play
            obs_next, reward, done, info = env.step(a[0])
            achieved_goal, desired_goal, state_next, state_prime_next = unpack(obs_next)

            # add the normal experience to memory
            replay_memory.add(np.reshape(s, (actor.state_dim,)),
                              np.reshape(a, (actor.action_dim,)),
                              reward, done,
                              np.reshape(state_next, (actor.state_dim,)))

            # add the hindsight experience to memory: pretend the achieved
            # goal was the desired goal and recompute the reward
            substitute_goal = achieved_goal.copy()
            substitute_reward = env.compute_reward(achieved_goal, substitute_goal, info)
            replay_memory.add(np.reshape(s_prime, (actor.state_dim,)),
                              np.reshape(a, (actor.action_dim,)),
                              substitute_reward, True,
                              np.reshape(state_prime_next, (actor.state_dim,)))

            # start training once there is enough experience
            if replay_memory.size() > int(args['batch_size']):
                s_batch, a_batch, r_batch, d_batch, s2_batch = replay_memory.sample_batch(int(args['batch_size']))

                # temporal-difference targets:
                # the actor finds the target action, the critic the target Q
                a2_batch = actor.predict_target(s2_batch)
                q2_batch = critic.predict_target(s2_batch, a2_batch)

                # add the discounted next Q to the reward if not done
                r_batch_discounted = []
                for k in range(int(args['batch_size'])):
                    if d_batch[k]:
                        r_batch_discounted.append(r_batch[k])
                    else:
                        r_batch_discounted.append(r_batch[k] + critic.gamma * q2_batch[k])

                # train the critic with states, actions, and discounted rewards
                pred_q, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(r_batch_discounted, (int(args['batch_size']), 1)))

                # record the maximum Q
                episode_maximum_q += np.amax(pred_q)

                # the actor proposes actions; the critic comments on them
                a_outs = actor.predict(s_batch)
                comment_gradients = critic.get_comment_gradients(s_batch, a_outs)

                # train the actor with states and the comment gradients
                actor.train(s_batch, comment_gradients[0])

                # update the target networks
                actor.update()
                critic.update()

            # record the reward and move to the next state
            episode_reward += reward
            s = state_next

            # if the episode ends
            if done:
                # write the summary to TensorBoard
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: episode_reward,
                    summary_vars[1]: episode_maximum_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                # print out results and save the model
                print('| Episode: {:d} | Reward: {:d}'.format(i, int(episode_reward)))
                saver.save(sess, os.path.join(model_dir, args['env'] + '_' + args['name'] + '.ckpt'))
                break
    return
class Agent:
    def __init__(self, actions, gamma=0.7, e_greedy=0.7):
        self.actions = actions  # a list
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.01
        self.count = 0
        self.epochs = 50
        self.bar = Progbar(self.epochs)
        self.epoch_loss_avg = tf.keras.metrics.Mean()
        self.batch_size = 100
        self.state_size = 2
        self.record_size = 200
        # initialize both models: the hard-working one and the dump (target) one
        M = Build_Model(self.state_size, 16, len(actions))
        self.model = M.build()
        self.dump_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        # initialize the memory backed by a sum tree
        self.capacity = 200
        self.memory = Memory(self.capacity)

    def choose_action(self, s):
        # action selection
        if np.random.uniform() < self.epsilon:
            # choose the best action
            state_action = self.model.predict([[s]])[0]
            action = np.argmax(state_action)
        else:
            # choose a random action
            action = np.random.choice(self.actions)
        return action

    def store(self, s, a, r, s_, done):
        self.memory.add(1, [s, a, r, s_, done])
        self.count += 1

    def learn(self):
        loss_record = []
        batch, idxs, is_weight = self.memory.sample(self.batch_size)
        X_train = np.zeros((self.batch_size, self.state_size))
        Y_train = [np.zeros(len(self.actions)) for i in range(self.batch_size)]
        for i in range(self.batch_size):
            _s, _a, _r, _s_, done_ = (batch[i][0], batch[i][1], batch[i][2],
                                      batch[i][3], batch[i][4])
            q_list, loss = self.get_loss(_s, _a, _r, _s_, done_)
            loss_record.append(loss)
            X_train[i] = _s
            for i_ in range(len(self.actions)):
                Y_train[i][i_] = q_list[i_]
        # Train!
        print('-----------Training-----------')
        for i in range(self.epochs):
            self.train(self.model, X_train, Y_train)
            self.bar.update(i, values=[('loss', self.epoch_loss_avg.result().numpy())])
        # update prioritized experience
        for i in range(self.batch_size):
            loss = loss_record[i] * is_weight[i]
            self.memory.update(idxs[i], loss)

    def get_loss(self, s, a, r, s_, done):
        # calculate the Q value; double DQN: the online net selects the next
        # action, the dump (target) net evaluates it
        q_list = self.model.predict([[s]])[0]
        q_predict = q_list[a]
        next_q_list = self.model.predict([[s_]])[0]
        qvalue = self.dump_model.predict([[s_]])[0][np.argmax(next_q_list)]
        if done:
            q_target = r
        else:
            q_target = r + self.gamma * qvalue
        loss = abs(q_target - q_predict)  # TD error, used as the PER priority
        q_list[a] = q_target
        return q_list, loss

    def _loss(self, model, x, y):
        x = np.array(x)
        y_ = model(x)
        loss = huber_loss(y, y_)
        return loss

    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(model, inputs, targets)
        self.epoch_loss_avg(loss_value)
        return tape.gradient(loss_value, self.model.trainable_variables)

    def train(self, model, s, q):
        grads = self._grad(model, s, q)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables),
            get_or_create_global_step())
class Agent:
    def __init__(self, config):
        self.config = config
        self.epsilon = self.config.explore_start
        print("start of epsilon is ", self.epsilon)
        self.brain = MModel(self.config.action_size,
                            self.config.state_size[0],
                            self.config.state_size[1],
                            self.config.state_size[2])
        self.memory = Memory(self.config.memory_size)
        self.num_actions = self.config.action_size
        self.decayStep = 0

    def act(self, StackOfImage):
        # Input is a stack of frames of shape (84, 84, 4); the network expects
        # a batch, i.e. (batch_size, height, width, depth). Choose the action
        # with epsilon-greedy exploration.
        if random.random() <= self.epsilon:
            # randomly explore an action
            action_index = random.randrange(self.num_actions)
        else:
            q = self.brain.predict(
                StackOfImage.reshape((1, *StackOfImage.shape)))
            action_index = np.argmax(q)
        return action_index

    def exploreLess(self):
        # Decay epsilon gradually from explore_start toward explore_stop.
        self.decayStep += 1
        self.epsilon = self.config.explore_stop + (
            self.config.explore_start - self.config.explore_stop) * np.exp(
                -self.config.decay_rate * self.decayStep)

    def remember(self, experience):
        self.memory.add(experience)

    def replay(self):
        # Replay means learn; it also decreases epsilon.
        self.exploreLess()
        batch = self.memory.sample(self.config.batch_size)
        # ndmin=3 stacks the frames into (batch, 84, 84, 4)
        states_mb = np.array([each[0] for each in batch], ndmin=3)
        actions_mb = np.array([each[1] for each in batch])
        rewards_mb = np.array([each[2] for each in batch])
        next_states_mb = np.array([each[3] for each in batch], ndmin=3)
        dones_mb = np.array([each[4] for each in batch])

        Qs_targets = self.getQvalue(states_mb)
        Qs_nextState = self.getQvalue(next_states_mb)  # shape (batch, actions)

        for i in range(0, len(batch)):
            terminal = dones_mb[i]
            # Write the bootstrapped target at the action that was taken.
            actionNumber = actions_mb[i]
            if terminal:
                Qs_targets[i][actionNumber] = rewards_mb[i]
            else:
                Qs_targets[i][actionNumber] = rewards_mb[i] + \
                    self.config.gamma * np.max(Qs_nextState[i])
        loss = self.brain.train(states_mb, Qs_targets)

    def getQvalue(self, stackOfImage):
        return self.brain.predict(stackOfImage)

    def saveModel(self, name):
        self.brain.save(self.config.checkpoints + '/' + name + '.ckpt')

    def getEpsilon(self):
        return self.epsilon
class NoisyQ:
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        self.actions = actions  # a list
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 5
        # initialize both models: the hard-working one and the dump (target) one
        self.m = Build_Model(1, 10, len(actions))
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)
        # initialize the memory backed by a sum tree
        self.capacity = 200
        self.memory = Memory(self.capacity)

    def choose_action(self, s):
        # action selection
        if np.random.uniform() < self.epsilon:
            # choose the best action, breaking ties randomly
            state_action = self.model.predict([s])[0]
            action = np.random.choice([
                i for i in range(len(state_action))
                if state_action[i] == max(state_action)
            ])
        else:
            # choose a random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        batch_size = 100
        record_size = 300
        s, a, r, s_, q_list, loss = self.q_value_cal(s, a, r, s_)
        self.memory.add(loss, [s, a, r, s_, q_list])
        self.count += 1
        # train after every record_size stored transitions
        if self.count % record_size == 0:
            batch, idxs, is_weight = self.memory.sample(batch_size)
            Train = copy.copy(batch)
            X_train = np.array(Train)[:, 0]
            Y_train = np.array([i for i in np.array(Train)[:, 4]])
            self.model.fit(X_train, Y_train, epochs=self.epochs)
            # update prioritized experience
            for i in range(batch_size):
                _s, _a, _r, _s_ = (batch[i][0], batch[i][1], batch[i][2],
                                   batch[i][3])
                loss = self.q_value_cal(_s, _a, _r, _s_)[5]
                self.memory.update(idxs[i], loss)

    def q_value_cal(self, s, a, r, s_):
        # calculate the Q value; double DQN: the online net selects the next
        # action, the dump (target) net evaluates it
        q_list = self.model.predict([s])[0]
        q_predict = q_list[a]
        next_q_list = self.model.predict([s_])[0]
        qvalue = self.dump_model.predict([s_])[0][np.argmax(next_q_list)]
        loss = abs(r + self.gamma * qvalue - q_predict)  # TD error as PER priority
        q_list[a] = r + self.gamma * qvalue
        return s, a, r, s_, q_list, loss
class DoubleDQN:
    def __init__(self, state_shape, action_size, discount_rate=0.97,
                 epsilon=1.0, epsilon_decay=0.98, mem_size=20001,
                 batch_size=32, tau=0.05):
        self.state_shape = state_shape
        self.action_size = action_size
        self.epsilon = epsilon
        self.epsilon_min = 0.05
        self.epsilon_decay = epsilon_decay
        self.discount_rate = discount_rate
        self.tau = tau
        self.memory = Memory(mem_size)
        self.batch_size = batch_size
        self.optimizer = Adam()
        self.model = self._build_model()
        self.target_model = self._build_model()

    def _build_model(self):
        inp = Input(self.state_shape)
        out = Dense(48, activation='relu')(inp)
        out = Dense(32, activation='relu')(out)
        out = Dense(self.action_size, activation='linear')(out)
        model = Model(inp, out)
        model.compile(loss="mse", optimizer=self.optimizer)
        return model

    def choose_action(self, state):
        prob = np.random.uniform(0, 1)
        if prob >= self.epsilon:
            return np.argmax(self.model.predict(np.array([state]))[0])
        else:
            return np.random.randint(self.action_size)

    def update_epsilon(self, loop_counter):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train(self):
        """samples is a set of (S, A, R, S', done)"""
        samples = self.memory.sample(self.batch_size)
        states = tf.convert_to_tensor(
            np.array([sample[0] for sample in samples]), dtype='float32')
        actions = np.array([sample[1] for sample in samples])
        rewards = np.array([sample[2] for sample in samples])
        next_states = np.array([sample[3] for sample in samples])
        done = np.array([sample[4] for sample in samples], dtype='float32')

        # Double DQN: the online model selects the next action,
        # the target model evaluates it.
        maxQ_next_actions_index = np.argmax(
            self.model.predict_on_batch(next_states), axis=1)
        target_next_Qvals = np.array([
            self.target_model.predict_on_batch(next_states)[i, maxQ_next_actions_index[i]]
            for i in range(len(samples))
        ])
        # Mask out the bootstrap term for terminal transitions
        # (done is assumed to be 1 when the episode has ended).
        target_Qvals = rewards + self.discount_rate * (1 - done) * target_next_Qvals

        out = self.model(states).numpy()
        for i in range(len(samples)):
            out[i][actions[i]] = target_Qvals[i]
        self.model.train_on_batch(states, out)

    def remember(self, state, action, reward, next_state, done):
        self.memory.add([state, action, reward, next_state, done])

    def update_target(self):
        # Polyak (soft) update of the target network.
        weights = self.model.trainable_variables
        target_weights = self.target_model.trainable_variables
        for i in range(len(weights)):
            target_weights[i].assign(self.tau * weights[i] +
                                     (1 - self.tau) * target_weights[i])

    def save(self, path):
        self.model.save(path + "/model.h5")
        self.target_model.save(path + "/target_model.h5")

    def load(self, path):
        self.model = load_model(path + "/model.h5")
        self.target_model = load_model(path + "/target_model.h5")
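# A minimal sketch of how DoubleDQN above might be driven on CartPole. The
# episode count, warm-up condition, and update cadence are assumptions, and
# `agent.memory.buffer` assumes the deque-backed Memory sketched earlier.
import gym
import numpy as np

env = gym.make('CartPole-v1')
agent = DoubleDQN(state_shape=(4,), action_size=env.action_space.n)

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        # Train and softly sync the target net once the buffer holds a batch.
        if len(agent.memory.buffer) > agent.batch_size:
            agent.train()
            agent.update_target()
    agent.update_epsilon(episode)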